# Data Preparation for SPOC


In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import datetime
import pathlib
import pandas as pd
from lxml import etree
import spacy
from spacy_lookup import Entity
papers_tei = pathlib.Path("/Google Drive/Shared drives/SUL AI 2020-2021/Project - Species Occurrences/papers_tei")

## Extracting Full-Text from TEI XML
[GROBID](https://grobid.readthedocs.io/en/latest/) extracts the full-text from the PDFs and saves the result in a
[TEI](https://tei-c.org/) XML document.

## WorMS Marine Species Dataframe


In [2]:
taxon = pd.read_csv('/Users/jpnelson/Google Drive/Shared drives/SUL AI 2020-2021/Project - Species Occurrences/WoRMS_marineSpecies/taxon.txt', sep="\t")

In [3]:
species = taxon[['taxonID', 'scientificName', 'references']]
species = species.rename(columns={"references": "URL"})

In [4]:
species.to_json("../data/species.json")

## Location DataFrame

In [5]:
ca_locations = pd.read_csv('/Users/jpnelson/Google Drive/Shared drives/SUL AI 2020-2021/Project - Species Occurrences/data/NamedPlaces_CA.csv')

In [6]:
or_locations = pd.read_csv('/Users/jpnelson/Google Drive/Shared drives/SUL AI 2020-2021/Project - Species Occurrences/data/NamedPlaces_OR.csv')

In [7]:
wa_locations = pd.read_csv('/Users/jpnelson/Google Drive/Shared drives/SUL AI 2020-2021/Project - Species Occurrences/data/NamedPlaces_WA.csv')

In [8]:
locations = pd.concat([ca_locations, or_locations, wa_locations], ignore_index=True)

In [9]:
locations = locations[['FEATURE_ID', 'FEATURE_NAME', 'STATE_ALPHA', 'PRIM_LONG_DEC', 'PRIM_LAT_DEC']]
locations = locations.rename(columns={'STATE_ALPHA': 'STATE', 
                                      'PRIM_LONG_DEC': 'LONGITUDE', 
                                      'PRIM_LAT_DEC': 'LATITUDE'})

In [10]:
locations.to_json("../data/locations.json")

## Habitat DataFrame

In [11]:
import json
habitats = []
with open("/Users/jpnelson/Google Drive/Shared drives/SUL AI 2020-2021/Project - Species Occurrences/data/habitats.jsonl") as fo:
    for row in fo.readlines():
        line = json.loads(row)
        habitats.append(line.get('pattern'))
habitats = pd.DataFrame(habitats, columns=['Habitat'])

In [12]:
habitats.to_json("../data/habitats.json")

## Load helper functions from `lib/etl.py` module

In [13]:
import sys, os
sys.path.append("../src")
import lib.etl as etl

In [14]:
# Uncomment these lines to load dataframes if you haven't generated the dataframes above
#species = pd.read_json('../data/species.json')
#locations = pd.read_json('../data/locations.json')
#habitats = pd.read_json('../data/habitats.json')

In [15]:
species_dict = dict(zip(species.taxonID, species.scientificName))
location_dict = dict(zip(locations.FEATURE_ID, locations.FEATURE_NAME))

## Construct spaCy Pipeline
Create a spaCy nlp pipeline from the existing [en_core_web_md](https://spacy.io/models/en#en_core_web_md) English pipeline .

In [16]:
# Iterate through species and location dictionary to turn values into lists
for key, val in species_dict.items():
    species_dict[key] = [val,]
for key, val in location_dict.items():
    location_dict[key] = [val,]

### Species, Locations, and Habitat Entity pipeline factories
This uses a forked version of [spacey lookup](https://github.com/sul-dlss-labs/spacy-lookup) that has been refactored to use spaCy 3.0.

In [17]:
from spacy.language import Language

@Language.factory(name='species_entity')
def create_species_entity(nlp: Language, name: str):
    return Entity(name=name, keywords_dict=species_dict, label='SPECIES')

@Language.factory(name='location_entity')
def create_location_entity(nlp: Language, name: str):
    return Entity(name=name, keywords_dict=location_dict, label='LOCATION')

@Language.factory(name='habitat_entity')
def create_habitat_entity(nlp: Language, name: str):
    return Entity(name=name, keywords_list=list(habitats.Habitat), label='HABITAT')

In [18]:
nlp = spacy.load('en_core_web_md')
nlp.add_pipe('species_entity')
nlp.add_pipe('location_entity')
nlp.add_pipe('habitat_entity')
nlp.remove_pipe("ner")

('ner', <spacy.pipeline.ner.EntityRecognizer at 0x7ff8f6a55e20>)

In [28]:
start = datetime.datetime.utcnow()
all_records = None
errors = []
print(f"Start conversion at {start}")
for i, tei_path in enumerate(papers_tei.iterdir()):
    try:
        records = etl.process_xml(tei_path.read_bytes(), tei_path.name, nlp)
    except:
        print(f"Error with {tei_path.name}")
        errors.append(tei_path)
        continue
    if all_records is None:
        all_records = records
    else:
        all_records = pd.concat([all_records, records], ignore_index=True)
    if not i%10 and i > 0:
        print(".", end="")
    if not i%25 and i > 0:
        print(f"{i}", end="")
end = datetime.datetime.utcnow()
print(f"Finished at {end}, total time {(end-start).seconds / 60.} minutes. Total errors {len(errors)}")

Start conversion at 2021-03-22 19:46:52.720431
..25...50..75...100..125...150..175...200..225...250..275...300..325...350..375...400..425...450..475...500..525...550..575...600..625...650..675...700..725...750..775...800..825...850..875...900..925...950..975...1000..1025...1050..1075...1100..1125...1150..1175...1200..1225...1250..1275...1300..1325...1350..1375...1400..1425...1450..1475...1500..1525...1550..1575...1600..1625.Finished at 2021-03-22 19:51:58.256269, total time 5.083333333333333 minutes. Total errors 0


In [29]:
all_records.shape

(12151, 8)

In [30]:
all_records.head()

Unnamed: 0,Paper ID,Instance ID,Species,GBIF,Time,Place,Habitats,div_enum
0,fhl_2011_Brezicha_25959.tei.xml,urn:lsid:marinespecies.org:taxname:240762,Strongylocentrotus franciscanus,https://www.gbif.org/species/search?q=Strongyl...,2011-12-09,"[(1505051, Friday Harbor), (1508076, San Juan ...","[sub-tidal, close to shore, river, tree]",1
1,fhl_2011_Brezicha_25959.tei.xml,urn:lsid:marinespecies.org:taxname:431072,radius,https://www.gbif.org/species/search?q=radius&q...,2011-12-09,"[(1508076, San Juan Channel), (1507585, Porter)]",[sea],3
2,fhl_2011_Brezicha_25959.tei.xml,urn:lsid:marinespecies.org:taxname:711954,Parastichopus californicus,https://www.gbif.org/species/search?q=Parastic...,2011-12-09,"[(1507585, Porter)]","[sub-tidal, sea]",6
3,fhl_2011_Bockmon_26635.tei.xml,urn:lsid:marinespecies.org:taxname:603085,trossulus,https://www.gbif.org/species/search?q=trossulu...,,"[(1514917, Jackson Beach), (1507030, north end...",[],3
4,fhl_2011_Bockmon_26635.tei.xml,urn:lsid:marinespecies.org:taxname:112078,ammonia,https://www.gbif.org/species/search?q=ammonia&...,,[],[],11


In [31]:
all_records.to_json('../data/species-records.json')