lightudq.document_quality

import os
from pathlib import Path
from typing import Optional, Union

from dotenv import load_dotenv
from pydantic import BaseModel
from pydantic_ai import Agent

from lightudq.prompts import (
    CUSTOM_METRIC_PROMPT,
    FACT_CHECK_PROMPT,
    MISSING_QUESTIONS_PROMPT,
    PII_PRESENCE_CHECK_PROMPT,
    QNA_EXTRACT_PROMPT,
    SUMMARY_PROMPT,
)
from lightudq.schemas import (
    CustomMetric,
    CustomMetricResult,
    DocumentProfile,
    DocumentQualityCheckResult,
    InconsistentFacts,
    MissingQuestions,
    PIIPresence,
    QnAPairs,
)
from lightudq.utils import read_document

load_dotenv()


class DocumentQuality:
    """
    Checks the quality of the document
    """

    def __init__(
        self, file_path: str, model_name: str = "openai:gpt-4o", num_questions: int = 5
    ):
        """Initialize the DocumentQuality class.

        Parameters
        ----------
        file_path : str
            The path to the document file to be analyzed.
        model_name : str, optional
            The name of the LLM model to use for analysis; available models:
            https://ai.pydantic.dev/api/models/base/#pydantic_ai.models.KnownModelName.
            The default is 'openai:gpt-4o'.
        num_questions : int, optional
            The number of question-answer pairs to extract from the document, by default 5.
        """
        self.file_path = file_path
        self.document = read_document(file_path)
        self.output: Optional[DocumentQualityCheckResult] = None
        self.profile: Optional[DocumentProfile] = None
        self.llm_client = Agent(model_name)
        self._custom_metrics: list[CustomMetric] = []
        self._num_questions = num_questions

    def add_custom_metric(self, custom_metric: CustomMetric):
        """Add a custom metric to the DocumentQuality instance.

        Parameters
        ----------
        custom_metric : CustomMetric
            A pydantic model containing the custom metric details.
        """
        if custom_metric.name in [cm.name for cm in self._custom_metrics]:
            raise ValueError(
                f"Custom metric with name {custom_metric.name} already exists."
            )
        self._custom_metrics.append(custom_metric)

    def get_custom_metrics(self) -> list[CustomMetric]:
        """Get the list of custom metrics added to the DocumentQuality instance.

        Returns
        -------
        list[CustomMetric]: A list of custom metrics.
        """
        return self._custom_metrics

    def remove_custom_metric(self, custom_metric_name: str):
        """Remove a custom metric from the DocumentQuality instance by name.

        Parameters
        ----------
        custom_metric_name : str
            The name of the custom metric to be removed.
        """
        self._custom_metrics = [
            cm for cm in self._custom_metrics if cm.name != custom_metric_name
        ]

    def run(self) -> DocumentQualityCheckResult:
        """Run the document quality checks and return the results.

        Returns
        -------
        DocumentQualityCheckResult: A pydantic model containing the results of the document quality checks.
        """
        current_profile = self.get_document_profile()
        inconsistency_metric = self.compute_fact_checks(
            facts=current_profile.qnaPairs.answers
        )
        pii_metric = self.pii_presence_check()
        custom_metric_res = []
        for custom_metric in self._custom_metrics:
            custom_metric_output = self.get_custom_metric(custom_metric)
            custom_metric_res.append(
                CustomMetricResult(name=custom_metric.name, result=custom_metric_output)
            )
        return DocumentQualityCheckResult(
            profile=current_profile,
            inconsistency=inconsistency_metric,
            pii=pii_metric,
            customMetrics=custom_metric_res if custom_metric_res else None,
        )

    def compare(self, reference_profile: DocumentProfile) -> DocumentQualityCheckResult:
        """Compare the document quality against a reference profile.

        Parameters
        ----------
        reference_profile : DocumentProfile
            The reference profile to compare against.

        Returns
        -------
        DocumentQualityCheckResult: A pydantic model containing the results of the document quality checks against the reference profile.
        """
        incompleteness = self.incompleteness_metric(
            questions=reference_profile.qnaPairs.questions
        )
        if self.profile is None:
            self.profile = self.get_document_profile()
        inconsistency = self.compute_fact_checks(facts=self.profile.qnaPairs.answers)
        pii = self.pii_presence_check()
        inaccuracy = self.compute_fact_checks(facts=reference_profile.qnaPairs.answers)

        return DocumentQualityCheckResult(
            profile=self.profile,
            inconsistency=inconsistency,
            incompleteness=incompleteness,
            pii=pii,
            inaccuracy=inaccuracy,
        )

    def get_response_from_llm(
        self, msg: str, output_model: Optional[type[BaseModel]] = None
    ) -> Union[BaseModel, str]:
        """Get a response from the LLM for a given message, optionally parsed into a pydantic output model.

        Parameters
        ----------
        msg : str
            The message to send to the LLM.
        output_model : type[BaseModel], optional
            A pydantic model to parse the output into, by default None.

        Returns
        -------
        A pydantic model instance, or a plain string when no output model is given.
        """
        res = self.llm_client.run_sync(msg, output_type=output_model)
        return res.output

    def extract_qna(self) -> QnAPairs:
        """Extract pairs of questions and answers from the document.

        Returns
        -------
        QnAPairs: a pydantic model containing the list of questions and answers.
        """
        prompt = QNA_EXTRACT_PROMPT.format(
            document=self.document,
            output_schema=QnAPairs.model_json_schema(),
            num_questions=self._num_questions,
        )
        resp = self.get_response_from_llm(prompt, QnAPairs)
        return resp

    def compute_fact_checks(self, facts: list[str]) -> InconsistentFacts:
        """Check whether the provided facts are consistent with the document.

        Parameters
        ----------
        facts : list[str]
            The list of facts to check against the document.

        Returns
        -------
        InconsistentFacts: a pydantic model containing the inconsistent facts and metadata, if any.
        """
        prompt = FACT_CHECK_PROMPT.format(
            document=self.document,
            output_schema=InconsistentFacts.model_json_schema(),
            facts=facts,
        )
        resp = self.get_response_from_llm(prompt, InconsistentFacts)
        return resp

    def incompleteness_metric(self, questions: list[str]) -> MissingQuestions:
        """Check for questions that are not answered in the document.

        Parameters
        ----------
        questions : list[str]
            The list of questions to check against the document.

        Returns
        -------
        MissingQuestions: a pydantic model containing the list of questions not answered in the document.
        """
        prompt = MISSING_QUESTIONS_PROMPT.format(
            document=self.document,
            questions=questions,
            output_schema=MissingQuestions.model_json_schema(),
        )
        resp = self.get_response_from_llm(prompt, MissingQuestions)
        return resp

    def pii_presence_check(self) -> PIIPresence:
        """Check for the presence of PII in the document.

        Returns
        -------
        PIIPresence: a pydantic model containing the presence of PII in the document, metadata if any, and the count of PII found.
        """
        prompt = PII_PRESENCE_CHECK_PROMPT.format(
            document=self.document, output_schema=PIIPresence.model_json_schema()
        )
        resp = self.get_response_from_llm(prompt, PIIPresence)
        return resp

    def get_word_count(self) -> int:
        """Get the word count of the document.

        Returns
        -------
        int: the number of words in the document.
        """
        content = self.document
        words = content.strip().split()
        return len(words)

    def get_doc_summary(self) -> str:
        """Get a summary of the document.

        Returns
        -------
        str: the summary of the document.
        """
        prompt = SUMMARY_PROMPT.format(document=self.document)
        resp = self.get_response_from_llm(prompt)
        return resp

    def get_custom_metric(self, custom_metric: CustomMetric) -> BaseModel:
        """Evaluate a custom metric against the document.

        Parameters
        ----------
        custom_metric : CustomMetric
            The custom metric to evaluate.

        Returns
        -------
        BaseModel: an instance of the metric's output model.
        """
        prompt = CUSTOM_METRIC_PROMPT.format(
            document=self.document,
            output_schema=custom_metric.outputModel.model_json_schema(),
            prompt=custom_metric.prompt,
        )
        resp = self.get_response_from_llm(prompt, custom_metric.outputModel)
        return resp

    def get_document_profile(self) -> DocumentProfile:
        """Get the profile of the document.

        Returns
        -------
        DocumentProfile: a pydantic model containing the profile of the document.
        """
        if self.profile:
            return self.profile

        qna = self.extract_qna()
        word_count = self.get_word_count()
        summary = self.get_doc_summary()
        title = os.path.basename(self.file_path)
        file_type = Path(self.file_path).suffix
        size = Path(self.file_path).stat().st_size

        self.profile = DocumentProfile(
            title=title,
            wordCount=word_count,
            qnaPairs=qna,
            summary=summary,
            fileType=file_type,
            fileSize=size,
        )
        return self.profile

class DocumentQuality:

Checks the quality of the document

DocumentQuality(file_path: str, model_name: str = 'openai:gpt-4o', num_questions: int = 5)

Initialize the DocumentQuality class.

Parameters

file_path : str
    The path to the document file to be analyzed.
model_name : str, optional
    The name of the LLM model to use for analysis; available models:
    https://ai.pydantic.dev/api/models/base/#pydantic_ai.models.KnownModelName.
    The default is 'openai:gpt-4o'.
num_questions : int, optional
    The number of question-answer pairs to extract from the document, by default 5.

Instance attributes: file_path, document, profile, llm_client.
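
A minimal usage sketch (the file path is a placeholder, and an OpenAI API key is assumed to be available to pydantic-ai, for example via a .env file picked up by load_dotenv):

    from lightudq.document_quality import DocumentQuality

    # Profile and check a single document; "handbook.md" is an illustrative path.
    dq = DocumentQuality("handbook.md", model_name="openai:gpt-4o", num_questions=5)
    result = dq.run()

    print(result.profile.summary)   # LLM-generated summary of the document
    print(result.inconsistency)     # self-consistency fact-check result (InconsistentFacts)
    print(result.pii)               # PII findings (PIIPresence)
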
def add_custom_metric(self, custom_metric: lightudq.schemas.CustomMetric):

Add a custom metric to the DocumentQuality instance. Raises a ValueError if a metric with the same name has already been added.

Parameters

custom_metric : CustomMetric
    A pydantic model containing the custom metric details (name, prompt, and output model).
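
A sketch of registering a custom metric before run(). It assumes CustomMetric is constructed from the name, prompt, and outputModel fields that this module reads; the metric and its output model below are illustrative, so check lightudq.schemas for the exact schema.

    from pydantic import BaseModel

    from lightudq.document_quality import DocumentQuality
    from lightudq.schemas import CustomMetric

    class ReadabilityResult(BaseModel):
        # Illustrative output model the LLM response is parsed into.
        grade_level: str
        justification: str

    readability = CustomMetric(
        name="readability",
        prompt="Estimate the reading grade level of the document and justify it.",
        outputModel=ReadabilityResult,
    )

    dq = DocumentQuality("handbook.md")  # placeholder path
    dq.add_custom_metric(readability)
    result = dq.run()  # result.customMetrics will hold one CustomMetricResult per metric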

def get_custom_metrics(self) -> list[lightudq.schemas.CustomMetric]:

Get the list of custom metrics added to the DocumentQuality instance.

Returns

list[CustomMetric]: A list of custom metrics.

def remove_custom_metric(self, custom_metric_name: str):

Remove a custom metric from the DocumentQuality instance by name.

Parameters

custom_metric_name : str
    The name of the custom metric to be removed.

def run(self) -> lightudq.schemas.DocumentQualityCheckResult:

Run the document quality checks and return the results.

Returns

DocumentQualityCheckResult: A pydantic model containing the results of the document quality checks.
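
The returned object is a pydantic model whose fields mirror the keyword arguments used in the source above (profile, inconsistency, pii, customMetrics), so it can be inspected or serialized directly; a sketch assuming pydantic v2 helpers and a placeholder path:

    from lightudq.document_quality import DocumentQuality

    dq = DocumentQuality("handbook.md")  # placeholder path
    result = dq.run()

    # Persist the full report for later review.
    with open("quality_report.json", "w") as f:
        f.write(result.model_dump_json(indent=2))

    if result.customMetrics:
        for metric in result.customMetrics:
            print(metric.name, metric.result)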

def compare(self, reference_profile: lightudq.schemas.DocumentProfile) -> lightudq.schemas.DocumentQualityCheckResult:

Compare the document quality against a reference profile.

Parameters

reference_profile : DocumentProfile
    The reference profile to compare against.

Returns

DocumentQualityCheckResult: A pydantic model containing the results of the document quality checks against the reference profile.
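
A sketch of checking a revised document against a previously profiled version (both paths are placeholders):

    from lightudq.document_quality import DocumentQuality

    # Profile the approved version once and keep it as the reference.
    reference = DocumentQuality("handbook_v1.md").get_document_profile()

    # Later, check a revised copy against that reference.
    report = DocumentQuality("handbook_v2.md").compare(reference_profile=reference)

    print(report.incompleteness)  # reference questions the revision no longer answers
    print(report.inaccuracy)      # reference answers the revision contradicts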

def get_response_from_llm(self, msg: str, output_model: Optional[type[pydantic.main.BaseModel]] = None) -> Union[pydantic.main.BaseModel, str]:

Get a response from the LLM for a given message, optionally parsed into a pydantic output model.

Parameters

msg : str
    The message to send to the LLM.
output_model : type[BaseModel], optional
    A pydantic model to parse the output into, by default None.

Returns

A pydantic model instance, or a plain string when no output model is given.
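
This is the low-level helper the other checks in this class are built on; a sketch of calling it directly with an illustrative output model and a placeholder path:

    from pydantic import BaseModel

    from lightudq.document_quality import DocumentQuality

    class Language(BaseModel):
        language: str  # illustrative structured output

    dq = DocumentQuality("handbook.md")  # placeholder path
    lang = dq.get_response_from_llm(
        "In which language is the following document written?\n" + dq.document,
        output_model=Language,
    )
    print(lang.language)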

def extract_qna(self) -> lightudq.schemas.QnAPairs:

Extract pairs of questions and answers from the document.

Returns

QnAPairs: a pydantic model containing the list of questions and answers.
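
The number of pairs is controlled by the num_questions constructor argument; a sketch that assumes the questions and answers lists are parallel (the path is a placeholder):

    from lightudq.document_quality import DocumentQuality

    dq = DocumentQuality("handbook.md", num_questions=3)  # placeholder path
    qna = dq.extract_qna()
    for question, answer in zip(qna.questions, qna.answers):
        print(f"Q: {question}\nA: {answer}")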

def compute_fact_checks(self, facts: list[str]) -> lightudq.schemas.InconsistentFacts:

Check whether the provided facts are consistent with the document.

Parameters

facts : list[str]
    The list of facts to check against the document.

Returns

InconsistentFacts: a pydantic model containing the inconsistent facts and metadata, if any.
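
A sketch of fact-checking externally supplied claims against the document (the claims and path are illustrative):

    from lightudq.document_quality import DocumentQuality

    dq = DocumentQuality("handbook.md")  # placeholder path
    claims = [
        "The warranty period is 24 months.",
        "Support is available on weekends.",
    ]
    inconsistent = dq.compute_fact_checks(facts=claims)
    print(inconsistent)  # InconsistentFacts; see lightudq.schemas for its exact fields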

def incompleteness_metric(self, questions: list[str]) -> lightudq.schemas.MissingQuestions:

Check for questions that are not answered in the document.

Parameters

questions : list[str]
    The list of questions to check against the document.

Returns

MissingQuestions: a pydantic model containing the list of questions not answered in the document.
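
A sketch of checking whether the document still answers a fixed set of questions, for example questions taken from a reference profile (the questions and path are illustrative):

    from lightudq.document_quality import DocumentQuality

    dq = DocumentQuality("handbook.md")  # placeholder path
    required = [
        "How do I reset my password?",
        "Who do I contact about billing?",
    ]
    missing = dq.incompleteness_metric(questions=required)
    print(missing)  # MissingQuestions; see lightudq.schemas for its exact fields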

def pii_presence_check(self) -> lightudq.schemas.PIIPresence:

Check for the presence of PII in the document.

Returns

PIIPresence: a pydantic model containing the presence of PII in the document, metadata if any, and count of PII found
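
A sketch; the exact fields of PIIPresence are defined in lightudq.schemas, so the result is simply dumped here (the path is a placeholder):

    from lightudq.document_quality import DocumentQuality

    dq = DocumentQuality("handbook.md")  # placeholder path
    pii = dq.pii_presence_check()
    print(pii.model_dump())  # whether PII was found, what was found, and how much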

def get_word_count(self) -> int:

Get the word count of the document.

Returns

int: the number of words in the document

def get_doc_summary(self) -> str:

Get a summary of the document.

Returns

str: the summary of the document

def get_custom_metric(self, custom_metric: lightudq.schemas.CustomMetric) -> pydantic.main.BaseModel:

Evaluate a custom metric against the document and return an instance of the metric's output model.

def get_document_profile(self) -> lightudq.schemas.DocumentProfile:

Get the profile of the document.

Returns

DocumentProfile: a pydantic model containing the profile of the document (title, word count, Q&A pairs, summary, file type, and file size).
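
Profiles are plain pydantic models, so a profile can be saved and reloaded later as the reference for compare(); a sketch assuming pydantic v2 serialization helpers (paths are placeholders):

    from pathlib import Path

    from lightudq.document_quality import DocumentQuality
    from lightudq.schemas import DocumentProfile

    profile = DocumentQuality("handbook_v1.md").get_document_profile()
    Path("handbook_v1.profile.json").write_text(profile.model_dump_json(indent=2))

    # Later, in another run:
    reference = DocumentProfile.model_validate_json(
        Path("handbook_v1.profile.json").read_text()
    )
    report = DocumentQuality("handbook_v2.md").compare(reference_profile=reference)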