# Source code for olaf.pipeline.pipeline_schema

from typing import List, Optional

import spacy

from ..commons.errors import PipelineCorpusInitialisationError
from ..data_container.knowledge_representation_schema import KnowledgeRepresentation
from .data_preprocessing.data_preprocessing_schema import DataPreprocessing
from .pipeline_component.pipeline_component_schema import PipelineComponent
from ..repository.corpus_loader.corpus_loader_schema import CorpusLoader


class Pipeline:
    """A Pipeline is the library main class.

    It orchestrates the pipeline starting from raw texts to build the final
    knowledge representation. The corpus loader is responsible for the
    conversion from raw text to spacy documents. Data preprocessing is kept
    separate to explicitly enable pipelines without preprocessing.

    Attributes
    ----------
    spacy_model: spacy.language.Language
        The spacy model used to represent text corpus.
    pipeline_components: List[PipelineComponent]
        The ontology learning pipeline components that build the knowledge
        representation from the corpus.
    preprocessing_components: List[DataPreprocessing]
        The pipeline components specific to preprocessing.
    corpus_loader: CorpusLoader
        The component that loads the text corpus in the format used by the
        framework, i.e., a List[spacy.tokens.doc.Doc].
    corpus: List[spacy.tokens.doc.Doc]
        The preprocessed corpus the knowledge representation is built from.
    kr: KnowledgeRepresentation
        The knowledge extracted from the corpus.
    candidate_terms: Set[CandidateTerms]
        The candidate terms extracted and processed to create concept and
        relations.
    """

    def __init__(
        self,
        spacy_model: spacy.language.Language,
        pipeline_components: Optional[List[PipelineComponent]] = None,
        preprocessing_components: Optional[List[DataPreprocessing]] = None,
        corpus_loader: Optional[CorpusLoader] = None,
        corpus: Optional[List[spacy.tokens.doc.Doc]] = None,
        seed_kr: Optional[KnowledgeRepresentation] = None,
    ) -> None:
        """Initialise Pipeline instance.

        Parameters
        ----------
        spacy_model: spacy.language.Language
            The spacy model used to represent text corpus.
        pipeline_components: List[PipelineComponent], optional
            The ontology learning pipeline components that build the knowledge
            representation from the corpus, by default None.
            When None, an empty list is used.
        preprocessing_components: List[DataPreprocessing], optional
            The pipeline components specific to preprocessing, by default None.
            When None, an empty list is used.
        corpus_loader: CorpusLoader, optional
            The component that loads the text corpus in the format used by the
            framework, i.e., a List[spacy.tokens.doc.Doc], by default None.
            It is only called (with the spacy model) when no corpus is given.
        corpus: List[spacy.tokens.doc.Doc], optional
            The preprocessed corpus the knowledge representation is built from,
            by default None.
        seed_kr: KnowledgeRepresentation, optional
            An initial knowledge representation to work with, by default None.
            When None, a new empty KnowledgeRepresentation is created.

        Raises
        ------
        PipelineCorpusInitialisationError
            If neither a corpus nor a corpus loader is provided.
        """
        self.spacy_model = spacy_model
        self.corpus_loader = corpus_loader
        self.candidate_terms = set()

        # Lists passed by the caller are kept as-is (not copied); only a
        # missing argument is replaced by a fresh empty list.
        self.pipeline_components = (
            [] if pipeline_components is None else pipeline_components
        )
        self.preprocessing_components = (
            [] if preprocessing_components is None else preprocessing_components
        )

        # An explicit corpus takes precedence over the loader.
        if corpus is None:
            if corpus_loader is None:
                raise PipelineCorpusInitialisationError
            corpus = corpus_loader(spacy_model)
        self.corpus = corpus

        self.kr = KnowledgeRepresentation() if seed_kr is None else seed_kr

    def build(self) -> None:
        """Effectively build the pipeline, making the instance runnable.

        This method checks the resources of each pipeline component.
        Validation of the component order is not implemented yet.
        """
        # TODO : Check that the order of the pipeline components is valid.
        for component in self.pipeline_components:
            component.check_resources()

    def add_preprocessing_component(
        self, preprocessing_component: DataPreprocessing
    ) -> None:
        """Add a preprocessing component to the pipeline.

        The component is appended after the already registered ones.

        Parameters
        ----------
        preprocessing_component : DataPreprocessing
            The preprocessing pipeline component to add.
        """
        self.preprocessing_components.append(preprocessing_component)

    def remove_preprocessing_component(
        self, preprocessing_component: DataPreprocessing
    ) -> None:
        """Remove a preprocessing component from the pipeline.

        Parameters
        ----------
        preprocessing_component : DataPreprocessing
            The preprocessing pipeline component to remove.

        Raises
        ------
        ValueError
            If the component is not in the preprocessing component list.
        """
        self.preprocessing_components.remove(preprocessing_component)

    def add_pipeline_component(self, pipeline_component: PipelineComponent) -> None:
        """Add a component to the pipeline.

        The component is appended after the already registered ones.

        Parameters
        ----------
        pipeline_component : PipelineComponent
            The pipeline component to add.
        """
        self.pipeline_components.append(pipeline_component)

    def remove_pipeline_component(self, pipeline_component: PipelineComponent) -> None:
        """Remove a component from the pipeline.

        Parameters
        ----------
        pipeline_component : PipelineComponent
            The pipeline component to remove.

        Raises
        ------
        ValueError
            If the component is not in the pipeline component list.
        """
        self.pipeline_components.remove(pipeline_component)

    def run(self) -> None:
        """Run the pipeline.

        Every preprocessing component is run first, then every pipeline
        component, each group in list order. Each component receives this
        pipeline instance, through which the knowledge representation is
        filled.
        """
        for component in self.preprocessing_components:
            component.run(self)

        for component in self.pipeline_components:
            component.run(self)