Source code for olaf.pipeline.pipeline_component.candidate_term_enrichment.semantic_based_enrichment

from typing import Any, Dict, Optional

import numpy as np
from spacy.language import Language

from ...pipeline_schema import Pipeline
from ....commons.logging_config import logger
from ....data_container.candidate_term_schema import CandidateTerm
from ....data_container.enrichment_schema import Enrichment
from ..pipeline_component_schema import PipelineComponent


class SemanticBasedEnrichment(PipelineComponent):
    """Pipeline component to enrich candidate terms based on semantic meaning
    computed from embeddings similarity.

    The most similar words in the vocabulary are added as synonyms.

    Attributes
    ----------
    threshold : float, optional
        The threshold defines the minimum similarity score required to be synonymous.
        By default the threshold is set to 0.9.
    """

    def __init__(self, threshold: Optional[float] = None) -> None:
        """Initialise semantic based term enrichment instance.

        Parameters
        ----------
        threshold : float, optional
            The threshold defines the minimum similarity score required to be synonymous.
            By default the threshold is set to 0.9.
        """
        self.threshold = threshold
        self._check_parameters()
        self.check_resources()

    def _check_parameters(self) -> None:
        """Check whether required parameters are given and correct.

        If no threshold was provided, the default value 0.9 is set and a
        warning is logged. No exception is raised by this method.
        """
        # Compare against None explicitly so that a deliberate threshold of
        # 0.0 is honoured instead of being mistaken for "not provided".
        if self.threshold is None:
            self.threshold = 0.9
            logger.warning(
                "No value given for threshold parameter, default will be set to 0.9."
            )

    def check_resources(self) -> None:
        """Method to check that the component has access to all its required resources."""
        logger.info(
            "Semantic based enrichment pipeline component has no external resources to check."
        )

    def optimise(self) -> None:
        # TODO
        """A method to optimise the pipeline component by tuning the options."""
        raise NotImplementedError

    def _compute_metrics(self) -> None:
        """A method to compute component performance metrics.

        It is used by the optimise method to update the options.
        """
        raise NotImplementedError

    def get_performance_report(self) -> Dict[str, Any]:
        """A getter for the pipeline component performance report.

        If the component has been optimised, it only returns the best performance.
        Otherwise, it returns the results obtained with the set parameters.

        Returns
        -------
        Dict[str, Any]
            The pipeline component performance report.
        """
        raise NotImplementedError

    def enrich_term(self, c_term: CandidateTerm, spacy_model: Language) -> None:
        """Enrich candidate term synonyms based on most similar words in the vocabulary.

        Similarity is computed based on vectors cosine similarity measure.
        The ten most similar vocabulary entries are fetched and those whose
        score is strictly greater than the threshold are added as synonyms.

        Parameters
        ----------
        c_term : CandidateTerm
            The candidate term to enrich. Its enrichment is created if missing
            and at least one synonym was found.
        spacy_model : Language
            The spaCy model providing the vocabulary and its vectors.
        """
        synonyms = set()
        vocab = spacy_model.vocab

        if vocab.has_vector(c_term.label):
            # most_similar expects a 2D array of queries; there is a single query here.
            query = np.array([vocab.get_vector(c_term.label)])
            keys, _, scores = vocab.vectors.most_similar(query, n=10)

            # NOTE(review): the term's own entry is presumably among the
            # results — confirm whether it should be excluded.
            # Index 0 selects the results of the single query.
            for word_key, similarity_score in zip(keys[0], scores[0]):
                if similarity_score > self.threshold:
                    synonyms.add(vocab.strings[word_key])
                else:
                    # Results come back sorted by decreasing similarity (spaCy
                    # default), so no later entry can pass the threshold.
                    break
        else:
            # Lazy %-style arguments so that the label is actually rendered.
            logger.info(
                "%s has no vector, semantic enrichment can't be executed.",
                c_term.label,
            )

        if synonyms:
            if c_term.enrichment is None:
                c_term.enrichment = Enrichment()
            c_term.enrichment.add_synonyms(synonyms)

    def run(self, pipeline: Pipeline) -> None:
        """Method responsible for the component execution.

        Parameters
        ----------
        pipeline : Pipeline
            The pipeline running.
        """
        # The word "test" is probed as a proxy for "the model ships vectors".
        if not pipeline.spacy_model.vocab.has_vector("test"):
            logger.error(
                "No vectors loaded with the spaCy model. "
                "Consider use another model or another enrichment component."
            )
        else:
            for c_term in pipeline.candidate_terms:
                self.enrich_term(c_term, pipeline.spacy_model)