lightudq.document_quality
import os
from pathlib import Path
from typing import Optional, Union

from dotenv import load_dotenv
from pydantic import BaseModel
from pydantic_ai import Agent

from lightudq.prompts import (
    CUSTOM_METRIC_PROMPT,
    FACT_CHECK_PROMPT,
    MISSING_QUESTIONS_PROMPT,
    PII_PRESENCE_CHECK_PROMPT,
    QNA_EXTRACT_PROMPT,
    SUMMARY_PROMPT,
)
from lightudq.schemas import (
    CustomMetric,
    CustomMetricResult,
    DocumentProfile,
    DocumentQualityCheckResult,
    InconsistentFacts,
    MissingQuestions,
    PIIPresence,
    QnAPairs,
)
from lightudq.utils import read_document

# Load provider credentials (e.g. OPENAI_API_KEY) from a local .env, if present.
load_dotenv()


class DocumentQuality:
    """Runs LLM-backed quality checks against a single document.

    Checks include internal consistency (fact checking), completeness
    against a reference profile, PII detection, and user-defined custom
    metrics.
    """

    def __init__(
        self, file_path: str, model_name: str = "openai:gpt-4o", num_questions: int = 5
    ):
        """Initialize the DocumentQuality class.

        Parameters
        ----------
        file_path : str
            The path to the document file to be analyzed.
        model_name : str, optional
            The name of the LLM model to use for analysis, available models:
            https://ai.pydantic.dev/api/models/base/#pydantic_ai.models.KnownModelName.
            The default is 'openai:gpt-4o'.
        num_questions : int, optional
            The number of question-answer pairs to extract from the document,
            by default 5.
        """
        self.file_path = file_path
        self.document = read_document(file_path)
        # Result of the most recent run()/compare() call; None until one runs.
        self.output: Optional[DocumentQualityCheckResult] = None
        # Cached profile, populated lazily by get_document_profile().
        self.profile: Optional[DocumentProfile] = None
        self.llm_client = Agent(model_name)
        self._custom_metrics: list[CustomMetric] = []
        self._num_questions = num_questions

    def add_custom_metric(self, custom_metric: CustomMetric):
        """Add a custom metric to the DocumentQuality instance.

        Parameters
        ----------
        custom_metric : CustomMetric
            A pydantic model containing the custom metric details.

        Raises
        ------
        ValueError
            If a metric with the same name is already registered.
        """
        if any(cm.name == custom_metric.name for cm in self._custom_metrics):
            raise ValueError(
                f"Custom metric with name {custom_metric.name} already exists."
            )
        self._custom_metrics.append(custom_metric)

    def get_custom_metrics(self) -> list[CustomMetric]:
        """Get the list of custom metrics added to the DocumentQuality instance.

        Returns
        -------
        list[CustomMetric]: A list of custom metrics.
        """
        return self._custom_metrics

    def remove_custom_metric(self, custom_metric_name: str):
        """Remove a custom metric by name (no-op when the name is absent).

        Parameters
        ----------
        custom_metric_name : str
            The name of the custom metric to be removed.
        """
        self._custom_metrics = [
            cm for cm in self._custom_metrics if cm.name != custom_metric_name
        ]

    def run(self) -> DocumentQualityCheckResult:
        """Run the document quality checks and return the results.

        The result is also cached on ``self.output``.

        Returns
        -------
        DocumentQualityCheckResult: results of the document quality checks.
        """
        current_profile = self.get_document_profile()
        inconsistency_metric = self.compute_fact_checks(
            facts=current_profile.qnaPairs.answers
        )
        pii_metric = self.pii_presence_check()
        custom_metric_res = [
            CustomMetricResult(name=cm.name, result=self.get_custom_metric(cm))
            for cm in self._custom_metrics
        ]
        result = DocumentQualityCheckResult(
            profile=current_profile,
            inconsistency=inconsistency_metric,
            pii=pii_metric,
            customMetrics=custom_metric_res if custom_metric_res else None,
        )
        # Fix: self.output was declared in __init__ but never populated.
        self.output = result
        return result

    def compare(self, reference_profile: DocumentProfile) -> DocumentQualityCheckResult:
        """Compare the document quality against a reference profile.

        The result is also cached on ``self.output``.

        Parameters
        ----------
        reference_profile : DocumentProfile
            The reference profile to compare against.

        Returns
        -------
        DocumentQualityCheckResult: results of the checks against the
        reference profile.
        """
        incompleteness = self.incompleteness_metric(
            questions=reference_profile.qnaPairs.questions
        )
        if self.profile is None:
            self.profile = self.get_document_profile()
        inconsistency = self.compute_fact_checks(facts=self.profile.qnaPairs.answers)
        pii = self.pii_presence_check()
        inaccuracy = self.compute_fact_checks(facts=reference_profile.qnaPairs.answers)

        result = DocumentQualityCheckResult(
            profile=self.profile,
            inconsistency=inconsistency,
            incompleteness=incompleteness,
            pii=pii,
            inaccuracy=inaccuracy,
        )
        self.output = result
        return result

    def get_response_from_llm(
        self, msg: str, output_model: Optional[type[BaseModel]] = None
    ) -> Union[BaseModel, str]:
        """Get a response from the LLM for a given message.

        Parameters
        ----------
        msg : str
            The message to send to the LLM.
        output_model : type[BaseModel], optional
            Pydantic model to parse the output, by default None
            (a plain string is returned).

        Returns
        -------
        A pydantic model instance, or str when no output model is given.
        """
        res = self.llm_client.run_sync(msg, output_type=output_model)
        return res.output

    def extract_qna(self) -> QnAPairs:
        """Extract pairs of questions and answers from the document.

        Returns
        -------
        QnAPairs: the extracted questions and answers.
        """
        prompt = QNA_EXTRACT_PROMPT.format(
            document=self.document,
            output_schema=QnAPairs.model_json_schema(),
            num_questions=self._num_questions,
        )
        return self.get_response_from_llm(prompt, QnAPairs)

    def compute_fact_checks(self, facts: list[str]) -> InconsistentFacts:
        """Check whether the provided facts are consistent with the document.

        Parameters
        ----------
        facts : list[str]
            The list of facts to check against the document.

        Returns
        -------
        InconsistentFacts: the inconsistent facts and metadata, if any.
        """
        prompt = FACT_CHECK_PROMPT.format(
            document=self.document,
            output_schema=InconsistentFacts.model_json_schema(),
            facts=facts,
        )
        return self.get_response_from_llm(prompt, InconsistentFacts)

    def incompleteness_metric(self, questions: list[str]) -> MissingQuestions:
        """Check for questions not answered in the document.

        Parameters
        ----------
        questions : list[str]
            The list of questions to check against the document.

        Returns
        -------
        MissingQuestions: the questions not answered in the document.
        """
        prompt = MISSING_QUESTIONS_PROMPT.format(
            document=self.document,
            questions=questions,
            output_schema=MissingQuestions.model_json_schema(),
        )
        return self.get_response_from_llm(prompt, MissingQuestions)

    def pii_presence_check(self) -> PIIPresence:
        """Check for the presence of PII in the document.

        Returns
        -------
        PIIPresence: presence of PII, metadata if any, and count found.
        """
        prompt = PII_PRESENCE_CHECK_PROMPT.format(
            document=self.document, output_schema=PIIPresence.model_json_schema()
        )
        return self.get_response_from_llm(prompt, PIIPresence)

    def get_word_count(self) -> int:
        """Get the word count of the document.

        Returns
        -------
        int: the number of whitespace-separated words in the document.
        """
        return len(self.document.split())

    def get_doc_summary(self) -> str:
        """Get a summary of the document.

        Returns
        -------
        str: the summary of the document.
        """
        prompt = SUMMARY_PROMPT.format(document=self.document)
        return self.get_response_from_llm(prompt)

    def get_custom_metric(self, custom_metric: CustomMetric) -> BaseModel:
        """Evaluate one user-defined custom metric against the document."""
        prompt = CUSTOM_METRIC_PROMPT.format(
            document=self.document,
            output_schema=custom_metric.outputModel.model_json_schema(),
            prompt=custom_metric.prompt,
        )
        return self.get_response_from_llm(prompt, custom_metric.outputModel)

    def get_document_profile(self) -> DocumentProfile:
        """Get the profile of the document (cached after the first call).

        Returns
        -------
        DocumentProfile: the profile of the document.
        """
        if self.profile:
            return self.profile

        qna = self.extract_qna()
        word_count = self.get_word_count()
        summary = self.get_doc_summary()
        title = os.path.basename(self.file_path)
        file_type = Path(self.file_path).suffix
        size = Path(self.file_path).stat().st_size

        self.profile = DocumentProfile(
            title=title,
            wordCount=word_count,
            qnaPairs=qna,
            summary=summary,
            fileType=file_type,
            fileSize=size,
        )
        return self.profile
class DocumentQuality:
    """Evaluate the quality of a single document using LLM-based checks."""

    def __init__(
        self, file_path: str, model_name: str = "openai:gpt-4o", num_questions: int = 5
    ):
        """Create a quality checker for the document at *file_path*.

        Parameters
        ----------
        file_path : str
            Path of the document to analyze.
        model_name : str, optional
            LLM model identifier (see pydantic-ai KnownModelName),
            by default 'openai:gpt-4o'.
        num_questions : int, optional
            How many question-answer pairs to extract, by default 5.
        """
        self.file_path = file_path
        self.document = read_document(file_path)
        self.output: Optional[DocumentQualityCheckResult] = None
        self.profile = None
        self.llm_client = Agent(model_name)
        self._custom_metrics = []
        self._num_questions = num_questions

    def add_custom_metric(self, custom_metric: CustomMetric):
        """Register *custom_metric*; metric names must be unique.

        Parameters
        ----------
        custom_metric : CustomMetric
            The metric definition to register.
        """
        for existing in self._custom_metrics:
            if existing.name == custom_metric.name:
                raise ValueError(
                    f"Custom metric with name {custom_metric.name} already exists."
                )
        self._custom_metrics.append(custom_metric)

    def get_custom_metrics(self) -> list[CustomMetric]:
        """Return the custom metrics registered on this instance."""
        return self._custom_metrics

    def remove_custom_metric(self, custom_metric_name: str):
        """Remove the metric named *custom_metric_name* (no-op if absent)."""
        self._custom_metrics = [
            metric
            for metric in self._custom_metrics
            if metric.name != custom_metric_name
        ]

    def run(self) -> DocumentQualityCheckResult:
        """Execute every quality check and return the combined result."""
        profile = self.get_document_profile()
        inconsistency = self.compute_fact_checks(facts=profile.qnaPairs.answers)
        pii = self.pii_presence_check()
        custom_results = [
            CustomMetricResult(name=metric.name, result=self.get_custom_metric(metric))
            for metric in self._custom_metrics
        ]
        return DocumentQualityCheckResult(
            profile=profile,
            inconsistency=inconsistency,
            pii=pii,
            customMetrics=custom_results or None,
        )

    def compare(self, reference_profile: DocumentProfile) -> DocumentQualityCheckResult:
        """Check this document against *reference_profile*.

        Parameters
        ----------
        reference_profile : DocumentProfile
            The reference profile to compare against.
        """
        missing = self.incompleteness_metric(
            questions=reference_profile.qnaPairs.questions
        )
        if self.profile is None:
            self.profile = self.get_document_profile()
        inconsistency = self.compute_fact_checks(facts=self.profile.qnaPairs.answers)
        pii = self.pii_presence_check()
        inaccuracy = self.compute_fact_checks(facts=reference_profile.qnaPairs.answers)
        return DocumentQualityCheckResult(
            profile=self.profile,
            inconsistency=inconsistency,
            incompleteness=missing,
            pii=pii,
            inaccuracy=inaccuracy,
        )

    def get_response_from_llm(
        self, msg: str, output_model: Optional[type[BaseModel]] = None
    ) -> Union[BaseModel, str]:
        """Send *msg* to the model, parsing into *output_model* when given."""
        result = self.llm_client.run_sync(msg, output_type=output_model)
        return result.output

    def extract_qna(self) -> QnAPairs:
        """Extract question/answer pairs from the document via the LLM."""
        return self.get_response_from_llm(
            QNA_EXTRACT_PROMPT.format(
                document=self.document,
                output_schema=QnAPairs.model_json_schema(),
                num_questions=self._num_questions,
            ),
            QnAPairs,
        )

    def compute_fact_checks(self, facts: list[str]) -> InconsistentFacts:
        """Check the given *facts* for consistency with the document."""
        return self.get_response_from_llm(
            FACT_CHECK_PROMPT.format(
                document=self.document,
                output_schema=InconsistentFacts.model_json_schema(),
                facts=facts,
            ),
            InconsistentFacts,
        )

    def incompleteness_metric(self, questions: list[str]) -> MissingQuestions:
        """Find which of *questions* the document fails to answer."""
        return self.get_response_from_llm(
            MISSING_QUESTIONS_PROMPT.format(
                document=self.document,
                questions=questions,
                output_schema=MissingQuestions.model_json_schema(),
            ),
            MissingQuestions,
        )

    def pii_presence_check(self) -> PIIPresence:
        """Detect personally identifiable information in the document."""
        return self.get_response_from_llm(
            PII_PRESENCE_CHECK_PROMPT.format(
                document=self.document,
                output_schema=PIIPresence.model_json_schema(),
            ),
            PIIPresence,
        )

    def get_word_count(self) -> int:
        """Return the number of whitespace-separated words in the document."""
        return len(self.document.split())

    def get_doc_summary(self) -> str:
        """Ask the LLM for a summary of the document."""
        return self.get_response_from_llm(
            SUMMARY_PROMPT.format(document=self.document)
        )

    def get_custom_metric(self, custom_metric: CustomMetric) -> BaseModel:
        """Evaluate one user-defined metric against the document."""
        return self.get_response_from_llm(
            CUSTOM_METRIC_PROMPT.format(
                document=self.document,
                output_schema=custom_metric.outputModel.model_json_schema(),
                prompt=custom_metric.prompt,
            ),
            custom_metric.outputModel,
        )

    def get_document_profile(self) -> DocumentProfile:
        """Build the document profile; cached after the first call."""
        if self.profile:
            return self.profile
        qna_pairs = self.extract_qna()
        words = self.get_word_count()
        doc_summary = self.get_doc_summary()
        self.profile = DocumentProfile(
            title=os.path.basename(self.file_path),
            wordCount=words,
            qnaPairs=qna_pairs,
            summary=doc_summary,
            fileType=Path(self.file_path).suffix,
            fileSize=Path(self.file_path).stat().st_size,
        )
        return self.profile
Checks the quality of the document
def __init__(
    self, file_path: str, model_name: str = "openai:gpt-4o", num_questions: int = 5
):
    """Set up a quality checker for the document at *file_path*.

    Parameters
    ----------
    file_path : str
        Path of the document to analyze.
    model_name : str, optional
        LLM model identifier (see pydantic-ai KnownModelName),
        by default 'openai:gpt-4o'.
    num_questions : int, optional
        How many question-answer pairs to extract, by default 5.
    """
    self.file_path = file_path
    self.document = read_document(file_path)
    # Populated lazily: `output` by run()/compare(), `profile` by get_document_profile().
    self.output: Optional[DocumentQualityCheckResult] = None
    self.profile = None
    self.llm_client = Agent(model_name)
    self._custom_metrics = []
    self._num_questions = num_questions
Initialize the DocumentQuality class.
Parameters
file_path : str The path to the document file to be analyzed. model_name : str, optional The name of the LLM model to use for analysis, available models: https://ai.pydantic.dev/api/models/base/#pydantic_ai.models.KnownModelName. The default is 'openai:gpt-4o'. num_questions : int, optional The number of question-answer pairs to extract from the document, by default 5.
def add_custom_metric(self, custom_metric: CustomMetric):
    """Register *custom_metric*; metric names must be unique.

    Parameters
    ----------
    custom_metric : CustomMetric
        The metric definition to register.

    Raises
    ------
    ValueError
        If a metric with the same name is already registered.
    """
    for existing in self._custom_metrics:
        if existing.name == custom_metric.name:
            raise ValueError(
                f"Custom metric with name {custom_metric.name} already exists."
            )
    self._custom_metrics.append(custom_metric)
Add a custom metric to the DocumentQuality instance.
Parameters
custom_metric : CustomMetric A pydantic model containing the custom metric details.
def get_custom_metrics(self) -> list[CustomMetric]:
    """Return the custom metrics currently registered on this instance.

    Returns
    -------
    list[CustomMetric]: A list of custom metrics.
    """
    registered = self._custom_metrics
    return registered
Get the list of custom metrics added to the DocumentQuality instance.
Returns
list[CustomMetric]: A list of custom metrics.
def remove_custom_metric(self, custom_metric_name: str):
    """Drop the metric named *custom_metric_name*; no-op when absent.

    Parameters
    ----------
    custom_metric_name : str
        The name of the custom metric to be removed.
    """
    kept = []
    for metric in self._custom_metrics:
        if metric.name != custom_metric_name:
            kept.append(metric)
    self._custom_metrics = kept
Remove a custom metric from the DocumentQuality instance by name.
Parameters
custom_metric_name : str The name of the custom metric to be removed.
def run(self) -> DocumentQualityCheckResult:
    """Execute every quality check and return the combined result.

    Returns
    -------
    DocumentQualityCheckResult: results of the document quality checks.
    """
    profile = self.get_document_profile()
    inconsistency = self.compute_fact_checks(facts=profile.qnaPairs.answers)
    pii = self.pii_presence_check()
    # Evaluate each registered custom metric in registration order.
    custom_results = [
        CustomMetricResult(name=metric.name, result=self.get_custom_metric(metric))
        for metric in self._custom_metrics
    ]
    return DocumentQualityCheckResult(
        profile=profile,
        inconsistency=inconsistency,
        pii=pii,
        customMetrics=custom_results or None,
    )
Run the document quality checks and return the results.
Returns
DocumentQualityCheckResult: A pydantic model containing the results of the document quality checks.
def compare(self, reference_profile: DocumentProfile) -> DocumentQualityCheckResult:
    """Check this document's quality against *reference_profile*.

    Parameters
    ----------
    reference_profile : DocumentProfile
        The reference profile to compare against.

    Returns
    -------
    DocumentQualityCheckResult: results of the checks against the reference.
    """
    missing = self.incompleteness_metric(
        questions=reference_profile.qnaPairs.questions
    )
    # Profile this document first if it has not been profiled yet.
    if self.profile is None:
        self.profile = self.get_document_profile()
    inconsistency = self.compute_fact_checks(facts=self.profile.qnaPairs.answers)
    pii = self.pii_presence_check()
    inaccuracy = self.compute_fact_checks(facts=reference_profile.qnaPairs.answers)
    return DocumentQualityCheckResult(
        profile=self.profile,
        inconsistency=inconsistency,
        incompleteness=missing,
        pii=pii,
        inaccuracy=inaccuracy,
    )
Compare the document quality against a reference profile.
Parameters
reference_profile : DocumentProfile The reference profile to compare against.
Returns
DocumentQualityCheckResult: A pydantic model containing the results of the document quality checks against the reference profile.
def get_response_from_llm(
    self, msg: str, output_model: Optional[type[BaseModel]] = None
) -> Union[BaseModel, str]:
    """Send *msg* to the LLM and return its (optionally parsed) output.

    Parameters
    ----------
    msg : str
        The message to send to the LLM.
    output_model : type[BaseModel], optional
        Pydantic model used to parse the output, by default None.

    Returns
    -------
    A pydantic model instance (or plain string when no model is given).
    """
    result = self.llm_client.run_sync(msg, output_type=output_model)
    return result.output
get response from LLM for a given message and output model
Parameters
msg : str The message to send to the LLM output_model : Type[BaseModel], optional pydantic model to parse the output, by default None.
Returns
a pydantic model instance
def extract_qna(self) -> QnAPairs:
    """Extract question/answer pairs from the document via the LLM.

    Returns
    -------
    QnAPairs: the extracted questions and answers.
    """
    return self.get_response_from_llm(
        QNA_EXTRACT_PROMPT.format(
            document=self.document,
            output_schema=QnAPairs.model_json_schema(),
            num_questions=self._num_questions,
        ),
        QnAPairs,
    )
extract pairs of questions and answers from a document
Returns:
QnAPairs: a pydantic model containing the list of questions and answers
def compute_fact_checks(self, facts: list[str]) -> InconsistentFacts:
    """Check the given *facts* for consistency with the document.

    Parameters
    ----------
    facts : list[str]
        The list of facts to check against the document.

    Returns
    -------
    InconsistentFacts: the inconsistent facts and metadata, if any.
    """
    return self.get_response_from_llm(
        FACT_CHECK_PROMPT.format(
            document=self.document,
            output_schema=InconsistentFacts.model_json_schema(),
            facts=facts,
        ),
        InconsistentFacts,
    )
Checks whether the provided facts are consistent against the document
Parameters
facts : list[str] The list of facts to check against the document
Returns
InconsistentFacts: a pydantic model containing the inconsistent facts and metadata if any
def incompleteness_metric(self, questions: list[str]) -> MissingQuestions:
    """Find which of *questions* the document fails to answer.

    Parameters
    ----------
    questions : list[str]
        The list of questions to check against the document.

    Returns
    -------
    MissingQuestions: the questions not answered in the document.
    """
    return self.get_response_from_llm(
        MISSING_QUESTIONS_PROMPT.format(
            document=self.document,
            questions=questions,
            output_schema=MissingQuestions.model_json_schema(),
        ),
        MissingQuestions,
    )
check for questions not answered in a document
Parameters
questions : list[str] The list of questions to check against the document
Returns
MissingQuestions: a pydantic model containing the list of questions not answered in the document
def pii_presence_check(self) -> PIIPresence:
    """Detect personally identifiable information in the document.

    Returns
    -------
    PIIPresence: presence of PII, metadata if any, and count found.
    """
    return self.get_response_from_llm(
        PII_PRESENCE_CHECK_PROMPT.format(
            document=self.document,
            output_schema=PIIPresence.model_json_schema(),
        ),
        PIIPresence,
    )
check for presence of PII in a document
Returns
PIIPresence: a pydantic model containing the presence of PII in the document, metadata if any, and count of PII found
def get_word_count(self) -> int:
    """Return the number of whitespace-separated words in the document.

    Returns
    -------
    int: the number of words in the document.
    """
    # str.split() with no argument already ignores leading/trailing whitespace.
    return len(self.document.split())
get the word count of a document
Returns
int: the number of words in the document
def get_doc_summary(self) -> str:
    """Ask the LLM for a summary of the document.

    Returns
    -------
    str: the summary of the document.
    """
    return self.get_response_from_llm(
        SUMMARY_PROMPT.format(document=self.document)
    )
get a summary of a document
Returns
str: the summary of the document
def get_custom_metric(self, custom_metric: CustomMetric) -> BaseModel:
    """Evaluate one user-defined metric against the document.

    Parameters
    ----------
    custom_metric : CustomMetric
        The metric definition (prompt plus expected output model).

    Returns
    -------
    BaseModel: the LLM output parsed into the metric's output model.
    """
    return self.get_response_from_llm(
        CUSTOM_METRIC_PROMPT.format(
            document=self.document,
            output_schema=custom_metric.outputModel.model_json_schema(),
            prompt=custom_metric.prompt,
        ),
        custom_metric.outputModel,
    )
get a custom metric for a document
def get_document_profile(self) -> DocumentProfile:
    """Build the document profile; cached after the first call.

    Returns
    -------
    DocumentProfile: the profile of the document.
    """
    if self.profile:
        return self.profile

    # LLM-derived fields first (same order as before), then file metadata.
    qna_pairs = self.extract_qna()
    words = self.get_word_count()
    doc_summary = self.get_doc_summary()
    self.profile = DocumentProfile(
        title=os.path.basename(self.file_path),
        wordCount=words,
        qnaPairs=qna_pairs,
        summary=doc_summary,
        fileType=Path(self.file_path).suffix,
        fileSize=Path(self.file_path).stat().st_size,
    )
    return self.profile
get the profile of a document
Returns
DocumentProfile: a pydantic model containing profile of the document