Coverage for src / kemi / graph.py: 100%
80 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-06-05 15:47 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-06-05 15:47 +0000
1"""Memory graph: entity and relation extraction using local heuristics.
3Zero external dependencies. Uses regex and simple linguistic heuristics.
4"""
6import re
7from typing import Any
9from kemi.models import LifecycleState
11# Common entity patterns
12_ENTITY_PATTERNS = [
13 # Capitalized phrases (names, places, organizations)
14 re.compile(r"\b[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*\b"),
15]
17# Relation indicators
18_RELATION_KEYWORDS = {
19 "lives": "LOCATED_AT",
20 "live": "LOCATED_AT",
21 "works": "WORKS_AT",
22 "work": "WORKS_AT",
23 "likes": "PREFERS",
24 "like": "PREFERS",
25 "loves": "PREFERS",
26 "love": "PREFERS",
27 "hates": "DISLIKES",
28 "hate": "DISLIKES",
29 "prefers": "PREFERS",
30 "prefer": "PREFERS",
31 "enjoys": "ENJOYS",
32 "enjoy": "ENJOYS",
33 "uses": "USES",
34 "use": "USES",
35 "studies": "STUDIES",
36 "study": "STUDIES",
37 "born": "BORN_IN",
38 "from": "ORIGIN",
39 "visited": "VISITED",
40 "visit": "VISITED",
41 "traveled": "VISITED",
42 "travel": "VISITED",
43}
# Email addresses. The TLD class is [A-Za-z]; a "|" inside a character class
# is a literal pipe, not alternation, so it must not appear there.
_EMAIL_PATTERN = re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b")

# URLs starting with http://, https:// or www.
_URL_PATTERN = re.compile(r"https?://[^\s]+|www\.[^\s]+")

# Sentence punctuation that may directly follow a URL but is not part of it.
_URL_TRAILING_PUNCTUATION = ".,;:!?"


def extract_entities(text: str) -> list[dict[str, Any]]:
    """Extract named entities from text using heuristics.

    Three passes are made over ``text``, and results are returned in this
    order: matches of ``_ENTITY_PATTERNS`` (labelled by
    ``_guess_entity_label``), then email addresses, then URLs.

    Pattern matches are de-duplicated by their exact text, so only the first
    occurrence (and its offsets) of a repeated phrase is kept. Emails and URLs
    are not de-duplicated.

    Args:
        text: Input text.

    Returns:
        List of entity dicts with keys: text, label, start, end. ``start`` and
        ``end`` are character offsets into ``text``.
    """
    entities: list[dict[str, Any]] = []
    seen: set[str] = set()

    # Pattern-based extraction
    for pattern in _ENTITY_PATTERNS:
        for match in pattern.finditer(text):
            entity_text = match.group()
            if entity_text in seen:
                continue
            seen.add(entity_text)
            entities.append(
                {
                    "text": entity_text,
                    "label": _guess_entity_label(entity_text),
                    "start": match.start(),
                    "end": match.end(),
                }
            )

    # Email addresses
    entities.extend(
        {
            "text": match.group(),
            "label": "EMAIL",
            "start": match.start(),
            "end": match.end(),
        }
        for match in _EMAIL_PATTERN.finditer(text)
    )

    # URLs: the pattern runs to the next whitespace, so drop sentence
    # punctuation that was swallowed at the end and shrink ``end`` to match.
    for match in _URL_PATTERN.finditer(text):
        url = match.group().rstrip(_URL_TRAILING_PUNCTUATION)
        entities.append(
            {
                "text": url,
                "label": "URL",
                "start": match.start(),
                "end": match.start() + len(url),
            }
        )

    return entities
103def _guess_entity_label(text: str) -> str:
104 """Guess entity label based on simple heuristics."""
105 text_lower = text.lower()
107 # Location indicators
108 location_suffixes = ["city", "town", "village", "country", "state", "river", "mountain"]
109 if any(text_lower.endswith(s) for s in location_suffixes):
110 return "LOCATION"
112 # Organization indicators
113 org_indicators = ["inc", "corp", "llc", "ltd", "company", "org", "university", "school"]
114 if any(ind in text_lower for ind in org_indicators):
115 return "ORGANIZATION"
117 # Person names (simple heuristic: 1-3 words, common first name)
118 common_names = {
119 "john",
120 "jane",
121 "mary",
122 "james",
123 "robert",
124 "michael",
125 "william",
126 "david",
127 "richard",
128 "joseph",
129 "thomas",
130 "charles",
131 "daniel",
132 "matthew",
133 "anthony",
134 "mark",
135 "donald",
136 "steven",
137 "paul",
138 "andrew",
139 "kenneth",
140 "joshua",
141 "kevin",
142 "brian",
143 "george",
144 "edward",
145 "ronald",
146 "timothy",
147 "jason",
148 "jeffrey",
149 "ryan",
150 "jacob",
151 "gary",
152 "nicholas",
153 "eric",
154 "jonathan",
155 "stephen",
156 "larry",
157 "justin",
158 "scott",
159 "brandon",
160 "benjamin",
161 "samuel",
162 "frank",
163 "gregory",
164 "raymond",
165 "alexander",
166 "patrick",
167 "jack",
168 "dennis",
169 "jerry",
170 "tyler",
171 "aaron",
172 "jose",
173 "adam",
174 "nathan",
175 "henry",
176 "zachary",
177 "douglas",
178 "peter",
179 "kyle",
180 "walter",
181 "ethan",
182 "jeremy",
183 "harold",
184 "keith",
185 "christian",
186 "roger",
187 "noah",
188 "gerald",
189 "carl",
190 "terry",
191 "sean",
192 "austin",
193 "arthur",
194 "lawrence",
195 "jesse",
196 "dylan",
197 "bryan",
198 "joe",
199 "jordan",
200 "billy",
201 "bruce",
202 "albert",
203 "willie",
204 "gabriel",
205 "logan",
206 "alan",
207 "juan",
208 "wayne",
209 "roy",
210 "ralph",
211 "randy",
212 "eugene",
213 "vincent",
214 "russell",
215 "elijah",
216 "louis",
217 "bobby",
218 "philip",
219 "johnny",
220 "patricia",
221 "jennifer",
222 "linda",
223 "elizabeth",
224 "susan",
225 "jessica",
226 "sarah",
227 "karen",
228 "nancy",
229 "lisa",
230 "betty",
231 "margaret",
232 "sandra",
233 "ashley",
234 "kimberly",
235 "emily",
236 "donna",
237 "michelle",
238 "dorothy",
239 "carol",
240 "amanda",
241 "melissa",
242 "deborah",
243 "stephanie",
244 "rebecca",
245 "laura",
246 "sharon",
247 "cynthia",
248 "kathleen",
249 "amy",
250 "shirley",
251 "angela",
252 "helen",
253 "anna",
254 "brenda",
255 "pamela",
256 "nicole",
257 "emma",
258 "samantha",
259 "katherine",
260 "christine",
261 "debra",
262 "rachel",
263 "catherine",
264 "carolyn",
265 "janet",
266 "ruth",
267 "maria",
268 "heather",
269 "diane",
270 "virginia",
271 "julie",
272 "joyce",
273 "victoria",
274 "olivia",
275 "kelly",
276 "christina",
277 "lauren",
278 "joan",
279 "evelyn",
280 "judith",
281 "megan",
282 "cheryl",
283 "andrea",
284 "hannah",
285 "martha",
286 "jacqueline",
287 "frances",
288 "gloria",
289 "ann",
290 "teresa",
291 "kathryn",
292 "sara",
293 "janice",
294 "jean",
295 "alice",
296 "madison",
297 "doris",
298 "abigail",
299 "julia",
300 "judy",
301 "grace",
302 "denise",
303 "amber",
304 "marilyn",
305 "beverly",
306 "danielle",
307 "theresa",
308 "sophia",
309 "marie",
310 "diana",
311 "brittany",
312 "natalie",
313 "isabella",
314 "charlotte",
315 "rose",
316 "alexis",
317 "kayla",
318 }
320 words = text.split()
321 if len(words) <= 3:
322 first_word = words[0].lower()
323 if first_word in common_names:
324 return "PERSON"
326 # Default
327 return "ENTITY"
def extract_relations(text: str, entities: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Extract relations between entities in text.

    For every occurrence of a keyword from ``_RELATION_KEYWORDS``, the nearest
    entity ending at or before the keyword becomes the subject and the nearest
    entity starting at or after it becomes the object. A relation is emitted
    only when both exist and their texts differ.

    Relations are produced keyword by keyword (in ``_RELATION_KEYWORDS``
    order), not in order of appearance in ``text``, and are not de-duplicated.

    Args:
        text: Input text.
        entities: Pre-extracted entities; each needs "text", "start" and "end",
            with offsets referring to ``text``.

    Returns:
        List of relation dicts with subject, predicate, object and a fixed
        confidence of 0.6.
    """
    relations: list[dict[str, Any]] = []

    # Find relation keywords and link nearby entities
    for keyword, predicate in _RELATION_KEYWORDS.items():
        # Match case-insensitively against the original text rather than a
        # lowercased copy: str.lower() can change the string's length for some
        # characters, which would shift keyword offsets away from the entity
        # offsets they are compared with.
        keyword_re = rf"\b{re.escape(keyword)}\b"
        for match in re.finditer(keyword_re, text, flags=re.IGNORECASE):
            keyword_pos = match.start()

            # Nearest entity on each side of the keyword
            subject = _find_nearest_entity(entities, keyword_pos, before=True)
            obj = _find_nearest_entity(entities, keyword_pos, before=False)

            if subject and obj and subject["text"] != obj["text"]:
                relations.append(
                    {
                        "subject": subject["text"],
                        "predicate": predicate,
                        "object": obj["text"],
                        "confidence": 0.6,
                    }
                )

    return relations
365def _find_nearest_entity(
366 entities: list[dict[str, Any]],
367 position: int,
368 before: bool = True,
369) -> dict[str, Any] | None:
370 """Find the nearest entity to a position."""
371 best = None
372 best_dist = float("inf")
374 for ent in entities:
375 if before:
376 dist = position - ent["end"]
377 else:
378 dist = ent["start"] - position
380 if dist >= 0 and dist < best_dist:
381 best_dist = dist
382 best = ent
384 return best
def build_memory_graph(
    store: Any,
    user_id: str,
    namespace: str = "default",
) -> dict[str, Any]:
    """Build a memory graph from all of a user's memories.

    Fetches the user's ACTIVE and DECAYING memories from ``store``, runs
    entity and relation extraction on each memory's ``content``, and merges
    the results. Entities are unique per (text, label) and relations per
    (subject, predicate, object); in both cases the first occurrence is kept.

    Args:
        store: StorageAdapter instance.
        user_id: User ID.
        namespace: Memory namespace.

    Returns:
        Dict with 'entities' and 'relations' keys.
    """
    wanted_states = [LifecycleState.ACTIVE, LifecycleState.DECAYING]
    memories = store.get_all_by_user(
        user_id,
        lifecycle_filter=wanted_states,
        namespace=namespace,
    )

    graph_entities: list[dict[str, Any]] = []
    graph_relations: list[dict[str, Any]] = []
    entity_keys: set[str] = set()
    relation_keys: set[str] = set()

    for memory in memories:
        found_entities = extract_entities(memory.content)
        found_relations = extract_relations(memory.content, found_entities)

        for entity in found_entities:
            entity_key = f"{entity['text']}:{entity['label']}"
            if entity_key in entity_keys:
                continue
            entity_keys.add(entity_key)
            graph_entities.append(entity)

        # Keep only the first relation seen for each subject/predicate/object.
        for relation in found_relations:
            relation_key = f"{relation['subject']}:{relation['predicate']}:{relation['object']}"
            if relation_key in relation_keys:
                continue
            relation_keys.add(relation_key)
            graph_relations.append(relation)

    return {
        "entities": graph_entities,
        "relations": graph_relations,
    }