# Source code for gilda.api

# Public API of this module: grounding, NER, and grounder construction.
__all__ = [
    "ground",
    "get_models",
    "get_names",
    "get_grounder",
    "make_grounder",
    "annotate",
]

from typing import List, Mapping, Union, Optional

from gilda.grounder import Grounder
from gilda.term import Term


class GrounderInstance(object):
    """Lazily-initialized holder for the shared default :class:`Grounder`.

    The wrapped Grounder is only constructed on first use, so importing
    this module stays cheap; all module-level API functions delegate to a
    single instance of this class.
    """

    def __init__(self):
        # The wrapped Grounder; created on demand by get_grounder().
        self.grounder = None

    def get_grounder(self):
        """Construct the default Grounder on first access and return it."""
        if self.grounder is None:
            self.grounder = Grounder()
        return self.grounder

    def ground(self, text, context=None, organisms=None,
               namespaces=None):
        """Delegate grounding of the given text to the shared Grounder."""
        return self.get_grounder().ground(
            text,
            context=context,
            organisms=organisms,
            namespaces=namespaces,
        )

    def get_models(self):
        """Return entity texts for which disambiguation models exist."""
        return self.get_grounder().get_models()

    def get_names(self, db, id, status=None, source=None):
        """Return entity texts corresponding to the given database entry."""
        return self.get_grounder().get_names(
            db,
            id,
            status=status,
            source=source,
        )

    @property
    def prefix_index(self):
        """Expose the underlying Grounder's prefix index."""
        return self.get_grounder().prefix_index


# Module-level singleton used by all top-level API functions below.
grounder = GrounderInstance()


def ground(text, context=None, organisms=None, namespaces=None):
    """Return a list of scored matches for a text to ground.

    Parameters
    ----------
    text : str
        The entity text to be grounded.
    context : Optional[str]
        Any additional text that serves as context for disambiguating the
        given entity text, used if a model exists for disambiguating the
        given text.
    organisms : Optional[List[str]]
        A list of taxonomy identifiers to use as a priority list
        when surfacing matches for proteins/genes from multiple organisms.
    namespaces : Optional[List[str]]
        A list of namespaces to restrict the matches to. By default, no
        restriction is applied.

    Returns
    -------
    list[gilda.grounder.ScoredMatch]
        A list of ScoredMatch objects representing the groundings.

    Examples
    --------
    Ground a string corresponding to an entity name, label, or synonym

    >>> import gilda
    >>> scored_matches = gilda.ground('mapt')

    The matches are sorted in descending order by score, and in the event
    of a tie, by the namespace of the primary grounding. Each scored match
    has a :class:`gilda.term.Term` object that contain information about
    the primary grounding.

    >>> scored_matches[0].term.db
    'hgnc'
    >>> scored_matches[0].term.id
    '6893'
    >>> scored_matches[0].term.get_curie()
    'hgnc:6893'

    The score for each match can be accessed directly:

    >>> scored_matches[0].score
    0.7623

    The rationale for each match is contained in the ``match`` attribute
    whose fields are described in :class:`gilda.scorer.Match`:

    >>> match_object = scored_matches[0].match

    Give optional context to be used by Gilda's disambiguation models,
    if available

    >>> scored_matches = gilda.ground('ER', context='Calcium is released from the ER.')

    Only return results from a certain namespace, such as when a family
    and gene have the same name

    >>> scored_matches = gilda.ground('ESR', namespaces=["hgnc"])
    """
    # Delegate to the lazily-initialized module-level grounder singleton.
    return grounder.ground(text=text, context=context, organisms=organisms,
                           namespaces=namespaces)
def annotate(
    text: str,
    sent_split_fun=None,
    organisms=None,
    namespaces=None,
    context_text: Optional[str] = None,
):
    """Annotate a given text with Gilda (i.e., do named entity recognition).

    Parameters
    ----------
    text : str
        The text to be annotated.
    sent_split_fun : Callable, optional
        A function that splits the text into sentences. The default is
        :func:`nltk.tokenize.sent_tokenize`. The function should take a
        string as input and return an iterable of strings corresponding
        to the sentences in the input text.
    organisms : list[str], optional
        A list of organism names to pass to the grounder. If not
        provided, human is used.
    namespaces : list[str], optional
        A list of namespaces to pass to the grounder to restrict the
        matches to. By default, no restriction is applied.
    context_text : Optional[str]
        A longer span of text that serves as additional context for the
        text being annotated for disambiguation purposes.

    Returns
    -------
    list[tuple[str, list[ScoredMatch], int, int]]
        A list of matches where each match is a tuple consisting of the
        matches text span, the list of ScoredMatches, and the start and
        end character offsets of the text span.
    """
    # Imported locally to avoid a circular import at module load time.
    from .ner import annotate as _annotate
    return _annotate(
        text,
        grounder=grounder,
        sent_split_fun=sent_split_fun,
        organisms=organisms,
        namespaces=namespaces,
        context_text=context_text,
    )
def get_models():
    """Return a list of entity texts for which disambiguation models exist.

    Returns
    -------
    list[str]
        The list of entity texts for which a disambiguation model is
        available.
    """
    return grounder.get_models()
def get_names(db, id, status=None, source=None):
    """Return a list of entity texts corresponding to a given database ID.

    Parameters
    ----------
    db : str
        The database in which the ID is an entry, e.g., HGNC.
    id : str
        The ID of an entry in the database.
    status : Optional[str]
        If given, only entity texts with the given status e.g., "synonym"
        are returned.
    source : Optional[str]
        If given, only entity texts from the given source e.g., "uniprot"
        are returned.

    Returns
    -------
    list[str]
        The list of entity texts corresponding to the given entry,
        optionally filtered by status and source.
    """
    return grounder.get_names(db, id, status=status, source=source)
def get_grounder() -> Grounder:
    """Initialize and return the default Grounder instance.

    Returns
    -------
    :
        A Grounder instance whose attributes and methods can be used
        directly.
    """
    return grounder.get_grounder()
def make_grounder(
    terms: Union[str, List[Term], Mapping[str, List[Term]]],
) -> Grounder:
    """Create a custom grounder from a list of Terms.

    Parameters
    ----------
    terms :
        Specifies the grounding terms that should be loaded in the
        Grounder. If str, it is interpreted as a path to a grounding
        terms gzipped TSV file which is then loaded. If list, it is
        assumed to be a flat list of Terms. If dict, it is assumed to
        be a grounding terms dict with normalized entity strings as keys
        and lists of Term objects as values. Default: None

    Returns
    -------
    :
        A Grounder instance, initialized with either the default terms
        loaded from the resource file or a custom set of terms
        if the terms argument was specified.

    Examples
    --------
    The following example shows how to get an ontology with
    :mod:`obonet` and load custom terms:

    .. code-block:: python

        from gilda import make_grounder
        from gilda.process import normalize
        from gilda import Term

        prefix = "UBERON"
        url = "http://purl.obolibrary.org/obo/uberon/basic.obo"
        g = obonet.read_obo(url)
        custom_obo_terms = []
        it = tqdm(g.nodes(data=True), unit_scale=True, unit="node")
        for node, data in it:
            # Skip entries imported from other ontologies
            if not node.startswith(f"{prefix}:"):
                continue
            identifier = node.removeprefix(f"{prefix}:")
            name = data["name"]
            custom_obo_terms.append(gilda.Term(
                norm_text=normalize(name),
                text=name,
                db=prefix,
                id=identifier,
                entry_name=name,
                status="name",
                source=prefix,
            ))
            # Add terms for all synonyms
            for synonym_raw in data.get("synonym", []):
                try:
                    # Try to parse out of the quoted OBO Field
                    synonym = synonym_raw.split('"')[1].strip()
                except IndexError:
                    continue  # the synonym was malformed
                custom_obo_terms.append(gilda.Term(
                    norm_text=normalize(synonym),
                    text=synonym,
                    db=prefix,
                    id=identifier,
                    entry_name=name,
                    status="synonym",
                    source=prefix,
                ))

        custom_grounder = gilda.make_grounder(custom_obo_terms)
        scored_matches = custom_grounder.ground("head")

    Additional examples for loading custom content from OBO Graph JSON,
    :mod:`pyobo`, and more can be found in the `Jupyter notebooks
    <https://github.com/indralab/gilda/tree/master/notebooks>`_ in the
    Gilda repository on GitHub.
    """
    return Grounder(terms=terms)