# Source code for olaf.pipeline.pipeline_component.candidate_term_enrichment.semantic_based_enrichment
from typing import Any, Dict, Optional
import numpy as np
from spacy.language import Language
from ...pipeline_schema import Pipeline
from ....commons.logging_config import logger
from ....data_container.candidate_term_schema import CandidateTerm
from ....data_container.enrichment_schema import Enrichment
from ..pipeline_component_schema import PipelineComponent
[docs]
class SemanticBasedEnrichment(PipelineComponent):
    """Pipeline component to enrich candidate terms based on semantic meaning
    computed from embeddings similarity.

    The most similar words in the vocabulary are added as synonyms.

    Attributes
    ----------
    threshold : float, optional
        The threshold defines the minimum similarity score required to be synonymous.
        By default the threshold is set to 0.9.
    """

    def __init__(self, threshold: Optional[float] = None) -> None:
        """Initialise semantic based term enrichment instance.

        Parameters
        ----------
        threshold : float, optional
            The threshold defines the minimum similarity score required to be synonymous.
            By default the threshold is set to 0.9.
        """
        self.threshold = threshold
        self._check_parameters()
        self.check_resources()

    def _check_parameters(self) -> None:
        """Check whether required parameters are given and set defaults otherwise.

        When no threshold was provided, it falls back to 0.9 and a warning
        is logged. No exception is raised by this method.
        """
        # Compare against None explicitly: a falsy-but-valid threshold such as
        # 0.0 must not be silently replaced by the default value.
        if self.threshold is None:
            self.threshold = 0.9
            logger.warning(
                "No value given for threshold parameter, default will be set to 0.9."
            )

    def check_resources(self) -> None:
        """Method to check that the component has access to all its required resources."""
        logger.info(
            "Semantic based enrichment pipeline component has no external resources to check."
        )

    def optimise(self) -> None:
        """A method to optimise the pipeline component by tuning the options.

        Raises
        ------
        NotImplementedError
            Always, the optimisation is not implemented yet.
        """
        # TODO
        raise NotImplementedError

    def _compute_metrics(self) -> None:
        """A method to compute component performance metrics. It is used by the optimise
        method to update the options.

        Raises
        ------
        NotImplementedError
            Always, the metrics computation is not implemented yet.
        """
        raise NotImplementedError

    def enrich_term(self, c_term: CandidateTerm, spacy_model: Language) -> None:
        """Enrich candidate term synonyms based on most similar words in the vocabulary.

        The 10 entries of the vocabulary vectors table closest to the vector of
        the term label are retrieved, and those whose similarity score is
        strictly greater than the threshold are added as synonyms.

        Parameters
        ----------
        c_term : CandidateTerm
            The candidate term to enrich. Its enrichment is created if needed.
        spacy_model : Language
            The spaCy model providing the vocabulary and its vectors.
        """
        vocab = spacy_model.vocab

        if not vocab.has_vector(c_term.label):
            # The original message was a plain string with a broken placeholder,
            # so the label was never interpolated.
            logger.info(
                f"{c_term.label} has no vector, semantic enrichment can't be executed."
            )
            return

        # most_similar returns (keys, best_rows, scores), each with one row per
        # query vector; a single query is sent so only row 0 is relevant.
        keys, _, scores = vocab.vectors.most_similar(
            np.array([vocab.get_vector(c_term.label)]), n=10
        )

        synonyms = set()
        for word_key, similarity_score in zip(keys[0], scores[0]):
            if similarity_score > self.threshold:
                synonyms.add(vocab.strings[word_key])
            else:
                # Stop at the first score under the threshold; this relies on
                # the scores being returned in decreasing order.
                break

        if synonyms:
            if c_term.enrichment is None:
                c_term.enrichment = Enrichment()
            c_term.enrichment.add_synonyms(synonyms)

    def run(self, pipeline: Pipeline) -> None:
        """Method responsible for the component execution.

        Each candidate term of the pipeline is enriched, unless the spaCy model
        appears to have no vectors, in which case an error is logged.

        Parameters
        ----------
        pipeline : Pipeline
            The pipeline running.
        """
        # NOTE(review): vector availability is probed with the word "test";
        # presumably fine for English models - confirm for other languages.
        if not pipeline.spacy_model.vocab.has_vector("test"):
            logger.error(
                """No vectors loaded with the spaCy model.
                Consider use another model or another enrichment component."""
            )
        else:
            for c_term in pipeline.candidate_terms:
                self.enrich_term(c_term, pipeline.spacy_model)