1. Data Preparation for SPOC

# Notebook setup: auto-reload edited project modules and render plots inline.
%reload_ext autoreload
%autoreload 2
%matplotlib inline

# Standard library
import datetime
import pathlib
# Third-party: pandas for dataframes, lxml for TEI XML parsing,
# spaCy + spacy_lookup for dictionary-based entity matching.
import pandas as pd
from lxml import etree
import spacy
from spacy_lookup import Entity
# Directory of TEI XML files produced by GROBID from the source PDFs.
# NOTE(review): absolute Google Drive path — only valid on the original machine.
papers_tei = pathlib.Path("/Google Drive/Shared drives/SUL AI 2020-2021/Project - Species Occurrences/papers_tei")
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
<ipython-input-1-34de32e97c1c> in <module>
      1 get_ipython().run_line_magic('reload_ext', 'autoreload')
      2 get_ipython().run_line_magic('autoreload', '2')
----> 3 get_ipython().run_line_magic('matplotlib', 'inline')
      4 
      5 import datetime

/opt/hostedtoolcache/Python/3.9.5/x64/lib/python3.9/site-packages/IPython/core/interactiveshell.py in run_line_magic(self, magic_name, line, _stack_depth)
   2346                 kwargs['local_ns'] = self.get_local_scope(stack_depth)
   2347             with self.builtin_trap:
-> 2348                 result = fn(*args, **kwargs)
   2349             return result
   2350 

/opt/hostedtoolcache/Python/3.9.5/x64/lib/python3.9/site-packages/decorator.py in fun(*args, **kw)
    230             if not kwsyntax:
    231                 args, kw = fix(args, kw, sig)
--> 232             return caller(func, *(extras + args), **kw)
    233     fun.__name__ = func.__name__
    234     fun.__doc__ = func.__doc__

/opt/hostedtoolcache/Python/3.9.5/x64/lib/python3.9/site-packages/IPython/core/magic.py in <lambda>(f, *a, **k)
    185     # but it's overkill for just that one bit of state.
    186     def magic_deco(arg):
--> 187         call = lambda f, *a, **k: f(*a, **k)
    188 
    189         if callable(arg):

/opt/hostedtoolcache/Python/3.9.5/x64/lib/python3.9/site-packages/IPython/core/magics/pylab.py in matplotlib(self, line)
     97             print("Available matplotlib backends: %s" % backends_list)
     98         else:
---> 99             gui, backend = self.shell.enable_matplotlib(args.gui.lower() if isinstance(args.gui, str) else args.gui)
    100             self._show_matplotlib_backend(args.gui, backend)
    101 

/opt/hostedtoolcache/Python/3.9.5/x64/lib/python3.9/site-packages/ipykernel/zmqshell.py in enable_matplotlib(self, gui)
    599 
    600     def enable_matplotlib(self, gui=None):
--> 601         gui, backend = super(ZMQInteractiveShell, self).enable_matplotlib(gui)
    602 
    603         try:

/opt/hostedtoolcache/Python/3.9.5/x64/lib/python3.9/site-packages/IPython/core/interactiveshell.py in enable_matplotlib(self, gui)
   3515         """
   3516         from IPython.core import pylabtools as pt
-> 3517         from matplotlib_inline.backend_inline import configure_inline_support
   3518         gui, backend = pt.find_gui_and_backend(gui, self.pylab_gui_select)
   3519 

/opt/hostedtoolcache/Python/3.9.5/x64/lib/python3.9/site-packages/matplotlib_inline/backend_inline.py in <module>
      4 # Distributed under the terms of the BSD 3-Clause License.
      5 
----> 6 import matplotlib
      7 from matplotlib.backends.backend_agg import (  # noqa
      8     new_figure_manager,

ModuleNotFoundError: No module named 'matplotlib'

1.1. Extracting Full-Text from TEI XML

GROBID extracts the full-text from the PDFs and saves the result in a TEI XML document.

1.2. WoRMS Marine Species DataFrame

# Load the WoRMS taxon dump (tab-separated) and keep only the columns needed
# downstream: WoRMS identifier, scientific name, and the reference URL.
taxon = pd.read_csv(
    '/Users/jpnelson/Google Drive/Shared drives/SUL AI 2020-2021/Project - Species Occurrences/WoRMS_marineSpecies/taxon.txt',
    sep="\t",
)
species = (
    taxon[['taxonID', 'scientificName', 'references']]
    .rename(columns={"references": "URL"})
)
# Cache the slimmed table for later sessions (see the commented reload below).
species.to_json("../data/species.json")

1.3. Location DataFrame

# Read the GNIS named-place extracts for the three US West Coast states and
# combine them into one dataframe of feature id, name, state, and coordinates.
_places_dir = '/Users/jpnelson/Google Drive/Shared drives/SUL AI 2020-2021/Project - Species Occurrences/data'
_state_frames = [
    pd.read_csv(f"{_places_dir}/NamedPlaces_{state}.csv")
    for state in ("CA", "OR", "WA")
]
locations = pd.concat(_state_frames, ignore_index=True)
locations = (
    locations[['FEATURE_ID', 'FEATURE_NAME', 'STATE_ALPHA', 'PRIM_LONG_DEC', 'PRIM_LAT_DEC']]
    .rename(columns={'STATE_ALPHA': 'STATE',
                     'PRIM_LONG_DEC': 'LONGITUDE',
                     'PRIM_LAT_DEC': 'LATITUDE'})
)
# Cache for later sessions (see the commented reload below).
locations.to_json("../data/locations.json")

1.4. Habitat DataFrame

import json

# Build the habitat vocabulary: each line of habitats.jsonl is a JSON object
# whose "pattern" key holds one habitat phrase.
with open("/Users/jpnelson/Google Drive/Shared drives/SUL AI 2020-2021/Project - Species Occurrences/data/habitats.jsonl") as fo:
    # Iterate the file object lazily instead of readlines() — same result
    # without loading the whole file into memory first. .get() keeps lines
    # lacking a "pattern" key as None, matching the original behavior.
    habitats = [json.loads(row).get('pattern') for row in fo]
habitats = pd.DataFrame(habitats, columns=['Habitat'])
# Cache for later sessions (see the commented reload below).
habitats.to_json("../data/habitats.json")

1.5. Load helper functions from lib/etl.py module

import sys
import os

# Make the project's src/ directory importable so the shared ETL helpers load.
sys.path.append("../src")
import lib.etl as etl
# Uncomment these lines to load dataframes if you haven't generated the dataframes above
#species = pd.read_json('../data/species.json')
#locations = pd.read_json('../data/locations.json')
#habitats = pd.read_json('../data/habitats.json')
# Lookup tables for the entity factories:
#   WoRMS taxonID -> scientific name, and GNIS feature id -> feature name.
species_dict = {tax_id: sci_name
                for tax_id, sci_name in zip(species.taxonID, species.scientificName)}
location_dict = {feature_id: feature_name
                 for feature_id, feature_name in zip(locations.FEATURE_ID, locations.FEATURE_NAME)}

1.6. Construct spaCy Pipeline

Create a spaCy nlp pipeline from the existing en_core_web_md English pipeline.

# spacy_lookup expects each keyword value to be a *list* of synonyms, so wrap
# every scalar name in a one-element list. The dicts are mutated in place
# (not rebound) so any existing references see the updated values.
for tax_id in list(species_dict):
    species_dict[tax_id] = [species_dict[tax_id]]
for feature_id in list(location_dict):
    location_dict[feature_id] = [location_dict[feature_id]]

1.6.1. Species, Locations, and Habitat Entity pipeline factories

This uses a forked version of spacy-lookup that has been refactored to work with spaCy 3.0.

from spacy.language import Language

# spaCy 3.x pipeline-component factories. Each wraps a spacy_lookup Entity
# matcher built from one of the lookup tables constructed above.

@Language.factory(name='species_entity')
def create_species_entity(nlp: Language, name: str):
    """Factory for a component that tags WoRMS scientific names as SPECIES."""
    return Entity(name=name, keywords_dict=species_dict, label='SPECIES')

@Language.factory(name='location_entity')
def create_location_entity(nlp: Language, name: str):
    """Factory for a component that tags GNIS place names as LOCATION."""
    return Entity(name=name, keywords_dict=location_dict, label='LOCATION')

@Language.factory(name='habitat_entity')
def create_habitat_entity(nlp: Language, name: str):
    """Factory for a component that tags habitat phrases as HABITAT."""
    return Entity(name=name, keywords_list=list(habitats.Habitat), label='HABITAT')
# Assemble the pipeline: start from the medium English model, append the three
# dictionary-lookup components, then drop the statistical NER so its
# predictions don't compete with the lookup-based entities.
nlp = spacy.load('en_core_web_md')
nlp.add_pipe('species_entity')
nlp.add_pipe('location_entity')
nlp.add_pipe('habitat_entity')
nlp.remove_pipe("ner")
('ner', <spacy.pipeline.ner.EntityRecognizer at 0x7ff8f6a55e20>)
# Convert every TEI file into per-species occurrence records and accumulate
# them into a single dataframe (all_records).
start = datetime.datetime.now(datetime.timezone.utc)  # utcnow() is deprecated; use an aware UTC timestamp
frames = []  # per-paper dataframes; a single concat at the end is O(n), not O(n^2)
errors = []  # TEI paths whose conversion raised
print(f"Start conversion at {start}")
for i, tei_path in enumerate(papers_tei.iterdir()):
    try:
        records = etl.process_xml(tei_path.read_bytes(), tei_path.name, nlp)
    except Exception:  # narrowed from bare except: don't swallow KeyboardInterrupt/SystemExit
        print(f"Error with {tei_path.name}")
        errors.append(tei_path)
        continue
    frames.append(records)
    # Lightweight progress indicator: a dot every 10 papers, the count every 25.
    if not i%10 and i > 0:
        print(".", end="")
    if not i%25 and i > 0:
        print(f"{i}", end="")
# Preserve the original contract: all_records stays None when nothing converted.
all_records = pd.concat(frames, ignore_index=True) if frames else None
end = datetime.datetime.now(datetime.timezone.utc)
print(f"Finished at {end}, total time {(end-start).seconds / 60.} minutes. Total errors {len(errors)}")
Start conversion at 2021-03-22 19:46:52.720431
..25...50..75...100..125...150..175...200..225...250..275...300..325...350..375...400..425...450..475...500..525...550..575...600..625...650..675...700..725...750..775...800..825...850..875...900..925...950..975...1000..1025...1050..1075...1100..1125...1150..1175...1200..1225...1250..1275...1300..1325...1350..1375...1400..1425...1450..1475...1500..1525...1550..1575...1600..1625.Finished at 2021-03-22 19:51:58.256269, total time 5.083333333333333 minutes. Total errors 0
# Quick sanity check: (rows, columns) of the combined records.
all_records.shape
(12151, 8)
# Preview the first few extracted records.
all_records.head()
Paper ID Instance ID Species GBIF Time Place Habitats div_enum
0 fhl_2011_Brezicha_25959.tei.xml urn:lsid:marinespecies.org:taxname:240762 Strongylocentrotus franciscanus https://www.gbif.org/species/search?q=Strongyl... 2011-12-09 [(1505051, Friday Harbor), (1508076, San Juan ... [sub-tidal, close to shore, river, tree] 1
1 fhl_2011_Brezicha_25959.tei.xml urn:lsid:marinespecies.org:taxname:431072 radius https://www.gbif.org/species/search?q=radius&q... 2011-12-09 [(1508076, San Juan Channel), (1507585, Porter)] [sea] 3
2 fhl_2011_Brezicha_25959.tei.xml urn:lsid:marinespecies.org:taxname:711954 Parastichopus californicus https://www.gbif.org/species/search?q=Parastic... 2011-12-09 [(1507585, Porter)] [sub-tidal, sea] 6
3 fhl_2011_Bockmon_26635.tei.xml urn:lsid:marinespecies.org:taxname:603085 trossulus https://www.gbif.org/species/search?q=trossulu... [(1514917, Jackson Beach), (1507030, north end... [] 3
4 fhl_2011_Bockmon_26635.tei.xml urn:lsid:marinespecies.org:taxname:112078 ammonia https://www.gbif.org/species/search?q=ammonia&... [] [] 11
# Persist the combined species-occurrence records for downstream notebooks.
all_records.to_json('../data/species-records.json')