1. Data Preparation for SPOC¶
%reload_ext autoreload
%autoreload 2
%matplotlib inline
import datetime
import pathlib
import pandas as pd
from lxml import etree
import spacy
from spacy_lookup import Entity
papers_tei = pathlib.Path("/Google Drive/Shared drives/SUL AI 2020-2021/Project - Species Occurrences/papers_tei")
ModuleNotFoundError Traceback (most recent call last)
<ipython-input-1-34de32e97c1c> in <module>
1 get_ipython().run_line_magic('reload_ext', 'autoreload')
2 get_ipython().run_line_magic('autoreload', '2')
----> 3 get_ipython().run_line_magic('matplotlib', 'inline')
5 import datetime
/opt/hostedtoolcache/Python/3.9.5/x64/lib/python3.9/site-packages/IPython/core/interactiveshell.py in run_line_magic(self, magic_name, line, _stack_depth)
2346 kwargs['local_ns'] = self.get_local_scope(stack_depth)
2347 with self.builtin_trap:
-> 2348 result = fn(*args, **kwargs)
2349 return result
/opt/hostedtoolcache/Python/3.9.5/x64/lib/python3.9/site-packages/decorator.py in fun(*args, **kw)
230 if not kwsyntax:
231 args, kw = fix(args, kw, sig)
--> 232 return caller(func, *(extras + args), **kw)
233 fun.__name__ = func.__name__
234 fun.__doc__ = func.__doc__
/opt/hostedtoolcache/Python/3.9.5/x64/lib/python3.9/site-packages/IPython/core/magic.py in <lambda>(f, *a, **k)
185 # but it's overkill for just that one bit of state.
186 def magic_deco(arg):
--> 187 call = lambda f, *a, **k: f(*a, **k)
189 if callable(arg):
/opt/hostedtoolcache/Python/3.9.5/x64/lib/python3.9/site-packages/IPython/core/magics/pylab.py in matplotlib(self, line)
97 print("Available matplotlib backends: %s" % backends_list)
98 else:
---> 99 gui, backend = self.shell.enable_matplotlib(args.gui.lower() if isinstance(args.gui, str) else args.gui)
100 self._show_matplotlib_backend(args.gui, backend)
/opt/hostedtoolcache/Python/3.9.5/x64/lib/python3.9/site-packages/ipykernel/zmqshell.py in enable_matplotlib(self, gui)
600 def enable_matplotlib(self, gui=None):
--> 601 gui, backend = super(ZMQInteractiveShell, self).enable_matplotlib(gui)
603 try:
/opt/hostedtoolcache/Python/3.9.5/x64/lib/python3.9/site-packages/IPython/core/interactiveshell.py in enable_matplotlib(self, gui)
3515 """
3516 from IPython.core import pylabtools as pt
-> 3517 from matplotlib_inline.backend_inline import configure_inline_support
3518 gui, backend = pt.find_gui_and_backend(gui, self.pylab_gui_select)
/opt/hostedtoolcache/Python/3.9.5/x64/lib/python3.9/site-packages/matplotlib_inline/backend_inline.py in <module>
4 # Distributed under the terms of the BSD 3-Clause License.
----> 6 import matplotlib
7 from matplotlib.backends.backend_agg import ( # noqa
8 new_figure_manager,
ModuleNotFoundError: No module named 'matplotlib'
1.1. Extracting Full-Text from TEI XML¶
GROBID extracts the full-text from the PDFs and saves the result in a TEI XML document.
1.2. WoRMS Marine Species DataFrame¶
# Read the tab-separated WoRMS taxon export, keep only the identifier, name
# and reference columns, and expose `references` under the name `URL`.
taxon = pd.read_csv(
    '/Users/jpnelson/Google Drive/Shared drives/SUL AI 2020-2021/Project - Species Occurrences/WoRMS_marineSpecies/taxon.txt',
    sep="\t",
)
species = taxon[['taxonID', 'scientificName', 'references']].rename(
    columns={"references": "URL"}
)
1.3. Location DataFrame¶
# Load the named-places CSV files (NamedPlaces_CA / _OR / _WA), one DataFrame each.
ca_locations = pd.read_csv('/Users/jpnelson/Google Drive/Shared drives/SUL AI 2020-2021/Project - Species Occurrences/data/NamedPlaces_CA.csv')
or_locations = pd.read_csv('/Users/jpnelson/Google Drive/Shared drives/SUL AI 2020-2021/Project - Species Occurrences/data/NamedPlaces_OR.csv')
wa_locations = pd.read_csv('/Users/jpnelson/Google Drive/Shared drives/SUL AI 2020-2021/Project - Species Occurrences/data/NamedPlaces_WA.csv')
# Stack the three tables into one; ignore_index=True gives the result a fresh 0..n-1 index.
locations = pd.concat([ca_locations, or_locations, wa_locations], ignore_index=True)
locations = locations.rename(columns={'STATE_ALPHA': 'STATE',
1.4. Habitat DataFrame¶
import json
habitats = []
with open("/Users/jpnelson/Google Drive/Shared drives/SUL AI 2020-2021/Project - Species Occurrences/data/habitats.jsonl") as fo:
for row in fo.readlines():
line = json.loads(row)
habitats = pd.DataFrame(habitats, columns=['Habitat'])
1.5. Load helper functions from lib/etl.py
import sys, os
import lib.etl as etl
# Uncomment these lines to load dataframes if you haven't generated the dataframes above
#species = pd.read_json('../data/species.json')
#locations = pd.read_json('../data/locations.json')
#habitats = pd.read_json('../data/habitats.json')
# Lookup tables keyed by identifier, built by pairing two columns row by row:
# taxonID -> scientificName for species, FEATURE_ID -> FEATURE_NAME for locations.
species_dict = dict(zip(species.taxonID, species.scientificName))
location_dict = dict(zip(locations.FEATURE_ID, locations.FEATURE_NAME))
1.6. Construct spaCy Pipeline¶
Create a spaCy nlp pipeline from the existing en_core_web_md English pipeline.
# Wrap every value of the species and location lookup tables in a
# single-element list, updating both dictionaries in place.
# NOTE(review): presumably the list form is what Entity's keywords_dict
# expects — confirm against the spacy_lookup fork.
for lookup in (species_dict, location_dict):
    for entry_id in lookup:
        lookup[entry_id] = [lookup[entry_id]]
1.6.1. Species, Locations, and Habitat Entity pipeline factories¶
This uses a forked version of spacy-lookup that has been refactored to work with spaCy 3.0.
from spacy.language import Language
def create_species_entity(nlp: Language, name: str):
    """Create the species lookup ``Entity`` component.

    Args:
        nlp: Pipeline the component is being created for; not used here.
        name: Name passed through to the component.

    Returns:
        An ``Entity`` constructed with ``species_dict`` as its keyword
        dictionary and ``SPECIES`` as its label.
    """
    # NOTE(review): the (nlp, name) signature matches a spaCy component
    # factory, but no @Language.factory decorator is visible here — confirm
    # how this function is registered.
    species_component = Entity(name=name, keywords_dict=species_dict, label='SPECIES')
    return species_component
def create_location_entity(nlp: Language, name: str):
    """Create the location lookup ``Entity`` component.

    Args:
        nlp: Pipeline the component is being created for; not used here.
        name: Name passed through to the component.

    Returns:
        An ``Entity`` constructed with ``location_dict`` as its keyword
        dictionary and ``LOCATION`` as its label.
    """
    # NOTE(review): the (nlp, name) signature matches a spaCy component
    # factory, but no @Language.factory decorator is visible here — confirm
    # how this function is registered.
    location_component = Entity(name=name, keywords_dict=location_dict, label='LOCATION')
    return location_component
def create_habitat_entity(nlp: Language, name: str):
    """Create the habitat lookup ``Entity`` component.

    Args:
        nlp: Pipeline the component is being created for; not used here.
        name: Name passed through to the component.

    Returns:
        An ``Entity`` constructed with the values of the ``Habitat`` column
        of ``habitats`` as its keyword list and ``HABITAT`` as its label.
    """
    # NOTE(review): the (nlp, name) signature matches a spaCy component
    # factory, but no @Language.factory decorator is visible here — confirm
    # how this function is registered.
    habitat_terms = list(habitats.Habitat)
    return Entity(name=name, keywords_list=habitat_terms, label='HABITAT')
# Start from spaCy's existing en_core_web_md English pipeline.
nlp = spacy.load('en_core_web_md')
('ner', <spacy.pipeline.ner.EntityRecognizer at 0x7ff8f6a55e20>)
# Record the start time (naive UTC) so the total run time can be reported at the end.
start = datetime.datetime.utcnow()
# Accumulator for the per-paper records; stays None until the first result is stored.
all_records = None
# Failure list; its length is reported in the final summary line.
errors = []
print(f"Start conversion at {start}")
for i, tei_path in enumerate(papers_tei.iterdir()):
records = etl.process_xml(tei_path.read_bytes(), tei_path.name, nlp)
print(f"Error with {tei_path.name}")
if all_records is None:
all_records = records
all_records = pd.concat([all_records, records], ignore_index=True)
if not i%10 and i > 0:
print(".", end="")
if not i%25 and i > 0:
print(f"{i}", end="")
# Report the wall-clock duration of the run. total_seconds() is used instead
# of the .seconds attribute, which holds only the seconds field of the
# timedelta (whole seconds, with any full days excluded) and would therefore
# under-report long runs.
# utcnow() is kept so that `end` stays naive like `start` and the two can be subtracted.
end = datetime.datetime.utcnow()
print(f"Finished at {end}, total time {(end-start).total_seconds() / 60.} minutes. Total errors {len(errors)}")
Start conversion at 2021-03-22 19:46:52.720431
..25...50..75...100..125...150..175...200..225...250..275...300..325...350..375...400..425...450..475...500..525...550..575...600..625...650..675...700..725...750..775...800..825...850..875...900..925...950..975...1000..1025...1050..1075...1100..1125...1150..1175...1200..1225...1250..1275...1300..1325...1350..1375...1400..1425...1450..1475...1500..1525...1550..1575...1600..1625.Finished at 2021-03-22 19:51:58.256269, total time 5.083333333333333 minutes. Total errors 0
(12151, 8)
Paper ID | Instance ID | Species | GBIF | Time | Place | Habitats | div_enum | |
0 | fhl_2011_Brezicha_25959.tei.xml | urn:lsid:marinespecies.org:taxname:240762 | Strongylocentrotus franciscanus | https://www.gbif.org/species/search?q=Strongyl... | 2011-12-09 | [(1505051, Friday Harbor), (1508076, San Juan ... | [sub-tidal, close to shore, river, tree] | 1 |
1 | fhl_2011_Brezicha_25959.tei.xml | urn:lsid:marinespecies.org:taxname:431072 | radius | https://www.gbif.org/species/search?q=radius&q... | 2011-12-09 | [(1508076, San Juan Channel), (1507585, Porter)] | [sea] | 3 |
2 | fhl_2011_Brezicha_25959.tei.xml | urn:lsid:marinespecies.org:taxname:711954 | Parastichopus californicus | https://www.gbif.org/species/search?q=Parastic... | 2011-12-09 | [(1507585, Porter)] | [sub-tidal, sea] | 6 |
3 | fhl_2011_Bockmon_26635.tei.xml | urn:lsid:marinespecies.org:taxname:603085 | trossulus | https://www.gbif.org/species/search?q=trossulu... | [(1514917, Jackson Beach), (1507030, north end... | [] | 3 | |
4 | fhl_2011_Bockmon_26635.tei.xml | urn:lsid:marinespecies.org:taxname:112078 | ammonia | https://www.gbif.org/species/search?q=ammonia&... | [] | [] | 11 |