Coverage for src / kemi / graph.py: 100%

80 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-06-05 15:47 +0000

1"""Memory graph: entity and relation extraction using local heuristics. 

2 

3Zero external dependencies. Uses regex and simple linguistic heuristics. 

4""" 

5 

6import re 

7from typing import Any 

8 

9from kemi.models import LifecycleState 

10 

# Building block for entity detection: one capitalized word, i.e. an
# uppercase ASCII letter followed by at least one more ASCII letter.
_CAPITALIZED_WORD = r"[A-Z][a-zA-Z]+"

# Regexes tried in order when looking for candidate entities.
# Currently a single pattern: a run of one or more capitalized words
# separated by whitespace (names, places, organizations).
_ENTITY_PATTERNS = [
    re.compile(rf"\b{_CAPITALIZED_WORD}(?:\s+{_CAPITALIZED_WORD})*\b"),
]

16 

# Maps a lowercase trigger word to the relation predicate it signals.
# Built from (predicate, trigger words) groups; the resulting insertion
# order is the order in which keywords are scanned for relations.
_RELATION_KEYWORDS = {
    keyword: predicate
    for predicate, keywords in (
        ("LOCATED_AT", ("lives", "live")),
        ("WORKS_AT", ("works", "work")),
        ("PREFERS", ("likes", "like", "loves", "love")),
        ("DISLIKES", ("hates", "hate")),
        ("PREFERS", ("prefers", "prefer")),
        ("ENJOYS", ("enjoys", "enjoy")),
        ("USES", ("uses", "use")),
        ("STUDIES", ("studies", "study")),
        ("BORN_IN", ("born",)),
        ("ORIGIN", ("from",)),
        ("VISITED", ("visited", "visit", "traveled", "travel")),
    )
    for keyword in keywords
}

44 

45 

# E-mail addresses. The top-level domain must be two or more ASCII letters;
# a "|" inside the character class would be a literal pipe, not alternation.
_EMAIL_PATTERN = re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b")

# http(s) URLs and bare "www." hosts, each running up to the next whitespace.
_URL_PATTERN = re.compile(r"https?://[^\s]+|www\.[^\s]+")


def extract_entities(text: str) -> list[dict[str, Any]]:
    """Extract named entities from text using heuristics.

    Three passes are made over ``text`` and their results are concatenated
    in this order:

    1. Every pattern in ``_ENTITY_PATTERNS``; each match is labelled by
       ``_guess_entity_label``. Matches with identical text are reported
       once, at their first occurrence.
    2. E-mail addresses, labelled ``"EMAIL"``.
    3. URLs, labelled ``"URL"``.

    E-mail and URL matches are not deduplicated, and the passes are
    independent, so the same span of text can contribute to more than one
    pass.

    Args:
        text: Input text.

    Returns:
        List of entity dicts with keys: text, label, start, end. ``start``
        and ``end`` are the match offsets into ``text``.
    """
    entities: list[dict[str, Any]] = []
    seen: set[str] = set()

    # Pattern-based extraction, keeping only the first occurrence of a text.
    for pattern in _ENTITY_PATTERNS:
        for match in pattern.finditer(text):
            entity_text = match.group()
            if entity_text in seen:
                continue
            seen.add(entity_text)
            entities.append(
                {
                    "text": entity_text,
                    "label": _guess_entity_label(entity_text),
                    "start": match.start(),
                    "end": match.end(),
                }
            )

    # Fixed-label extraction: e-mail addresses first, then URLs.
    for label, pattern in (("EMAIL", _EMAIL_PATTERN), ("URL", _URL_PATTERN)):
        entities.extend(
            {
                "text": match.group(),
                "label": label,
                "start": match.start(),
                "end": match.end(),
            }
            for match in pattern.finditer(text)
        )

    return entities

101 

102 

# Endings of the whole lowercased entity text that mark it as a location.
_LOCATION_SUFFIXES = ("city", "town", "village", "country", "state", "river", "mountain")

# Word prefixes that mark an entity as an organization ("Inc", "Corp",
# "Corporation", "Organization", ...).
_ORG_INDICATORS = ("inc", "corp", "llc", "ltd", "company", "org", "university", "school")

# Lowercase first names used by the person heuristic. Built once at import
# time instead of on every call.
_COMMON_FIRST_NAMES = frozenset(
    {
        "john",
        "jane",
        "mary",
        "james",
        "robert",
        "michael",
        "william",
        "david",
        "richard",
        "joseph",
        "thomas",
        "charles",
        "daniel",
        "matthew",
        "anthony",
        "mark",
        "donald",
        "steven",
        "paul",
        "andrew",
        "kenneth",
        "joshua",
        "kevin",
        "brian",
        "george",
        "edward",
        "ronald",
        "timothy",
        "jason",
        "jeffrey",
        "ryan",
        "jacob",
        "gary",
        "nicholas",
        "eric",
        "jonathan",
        "stephen",
        "larry",
        "justin",
        "scott",
        "brandon",
        "benjamin",
        "samuel",
        "frank",
        "gregory",
        "raymond",
        "alexander",
        "patrick",
        "jack",
        "dennis",
        "jerry",
        "tyler",
        "aaron",
        "jose",
        "adam",
        "nathan",
        "henry",
        "zachary",
        "douglas",
        "peter",
        "kyle",
        "walter",
        "ethan",
        "jeremy",
        "harold",
        "keith",
        "christian",
        "roger",
        "noah",
        "gerald",
        "carl",
        "terry",
        "sean",
        "austin",
        "arthur",
        "lawrence",
        "jesse",
        "dylan",
        "bryan",
        "joe",
        "jordan",
        "billy",
        "bruce",
        "albert",
        "willie",
        "gabriel",
        "logan",
        "alan",
        "juan",
        "wayne",
        "roy",
        "ralph",
        "randy",
        "eugene",
        "vincent",
        "russell",
        "elijah",
        "louis",
        "bobby",
        "philip",
        "johnny",
        "patricia",
        "jennifer",
        "linda",
        "elizabeth",
        "susan",
        "jessica",
        "sarah",
        "karen",
        "nancy",
        "lisa",
        "betty",
        "margaret",
        "sandra",
        "ashley",
        "kimberly",
        "emily",
        "donna",
        "michelle",
        "dorothy",
        "carol",
        "amanda",
        "melissa",
        "deborah",
        "stephanie",
        "rebecca",
        "laura",
        "sharon",
        "cynthia",
        "kathleen",
        "amy",
        "shirley",
        "angela",
        "helen",
        "anna",
        "brenda",
        "pamela",
        "nicole",
        "emma",
        "samantha",
        "katherine",
        "christine",
        "debra",
        "rachel",
        "catherine",
        "carolyn",
        "janet",
        "ruth",
        "maria",
        "heather",
        "diane",
        "virginia",
        "julie",
        "joyce",
        "victoria",
        "olivia",
        "kelly",
        "christina",
        "lauren",
        "joan",
        "evelyn",
        "judith",
        "megan",
        "cheryl",
        "andrea",
        "hannah",
        "martha",
        "jacqueline",
        "frances",
        "gloria",
        "ann",
        "teresa",
        "kathryn",
        "sara",
        "janice",
        "jean",
        "alice",
        "madison",
        "doris",
        "abigail",
        "julia",
        "judy",
        "grace",
        "denise",
        "amber",
        "marilyn",
        "beverly",
        "danielle",
        "theresa",
        "sophia",
        "marie",
        "diana",
        "brittany",
        "natalie",
        "isabella",
        "charlotte",
        "rose",
        "alexis",
        "kayla",
    }
)


def _guess_entity_label(text: str) -> str:
    """Guess entity label based on simple heuristics.

    The checks run in this order and the first hit wins:

    1. ``"LOCATION"`` if the lowercased text ends with a location suffix.
    2. ``"ORGANIZATION"`` if any whitespace-separated word starts with an
       organization indicator.
    3. ``"PERSON"`` if the text has at most three words and the first one
       is a common first name.
    4. ``"ENTITY"`` otherwise, including for empty or whitespace-only text.

    Args:
        text: Entity text to classify.

    Returns:
        One of "LOCATION", "ORGANIZATION", "PERSON" or "ENTITY".
    """
    text_lower = text.lower()
    words = text_lower.split()

    # Nothing to classify; also keeps words[0] below from raising IndexError.
    if not words:
        return "ENTITY"

    if text_lower.endswith(_LOCATION_SUFFIXES):
        return "LOCATION"

    # Indicators are matched at the start of a word, not anywhere in the
    # text: a raw substring test labels "George" (contains "org") and
    # "Vincent" (contains "inc") as organizations and makes those first
    # names unreachable in the person check below.
    if any(word.startswith(_ORG_INDICATORS) for word in words):
        return "ORGANIZATION"

    # Person names (simple heuristic: 1-3 words, common first name).
    if len(words) <= 3 and words[0] in _COMMON_FIRST_NAMES:
        return "PERSON"

    return "ENTITY"

328 

329 

def extract_relations(text: str, entities: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Extract relations between entities in text.

    For every whole-word, case-insensitive occurrence of a keyword from
    ``_RELATION_KEYWORDS``, the nearest entity ending at or before the
    keyword becomes the subject and the nearest entity starting at or after
    the keyword's first character becomes the object. A relation is emitted
    only when both exist and their texts differ.

    Results are ordered by keyword (in ``_RELATION_KEYWORDS`` order) and
    then by position in the text. They are not deduplicated here.

    Args:
        text: Input text.
        entities: Pre-extracted entities; each needs "text", "start" and
            "end", with offsets measured against ``text``.

    Returns:
        List of relation dicts with keys: subject, predicate, object,
        confidence. Confidence is always 0.6.
    """
    relations: list[dict[str, Any]] = []

    # Without entities no subject/object pair can ever be formed.
    if not entities:
        return relations

    for keyword, predicate in _RELATION_KEYWORDS.items():
        # Search the original text case-insensitively rather than a
        # lowercased copy: str.lower() can change the string's length
        # (e.g. "İ" lowercases to two code points), which would shift
        # keyword offsets away from the entity offsets they are compared to.
        keyword_re = rf"\b{re.escape(keyword)}\b"
        for match in re.finditer(keyword_re, text, re.IGNORECASE):
            keyword_pos = match.start()

            subject = _find_nearest_entity(entities, keyword_pos, before=True)
            obj = _find_nearest_entity(entities, keyword_pos, before=False)

            if subject and obj and subject["text"] != obj["text"]:
                relations.append(
                    {
                        "subject": subject["text"],
                        "predicate": predicate,
                        "object": obj["text"],
                        "confidence": 0.6,
                    }
                )

    return relations

363 

364 

365def _find_nearest_entity( 

366 entities: list[dict[str, Any]], 

367 position: int, 

368 before: bool = True, 

369) -> dict[str, Any] | None: 

370 """Find the nearest entity to a position.""" 

371 best = None 

372 best_dist = float("inf") 

373 

374 for ent in entities: 

375 if before: 

376 dist = position - ent["end"] 

377 else: 

378 dist = ent["start"] - position 

379 

380 if dist >= 0 and dist < best_dist: 

381 best_dist = dist 

382 best = ent 

383 

384 return best 

385 

386 

def build_memory_graph(
    store: Any,
    user_id: str,
    namespace: str = "default",
) -> dict[str, Any]:
    """Build a memory graph from all of a user's memories.

    Fetches the user's ACTIVE and DECAYING memories from ``store``, runs
    entity and relation extraction on each memory's ``content``, and merges
    the results. Entities are unique per (text, label) and relations per
    (subject, predicate, object); in both cases the first one encountered
    is kept and first-seen order is preserved.

    Args:
        store: StorageAdapter instance.
        user_id: User ID.
        namespace: Memory namespace.

    Returns:
        Dict with 'entities' and 'relations' keys.
    """
    memories = store.get_all_by_user(
        user_id,
        lifecycle_filter=[LifecycleState.ACTIVE, LifecycleState.DECAYING],
        namespace=namespace,
    )

    # Insertion-ordered dicts double as "seen" sets and ordered results.
    entity_by_key: dict[str, dict[str, Any]] = {}
    relation_by_key: dict[str, dict[str, Any]] = {}

    for memory in memories:
        found_entities = extract_entities(memory.content)
        found_relations = extract_relations(memory.content, found_entities)

        for entity in found_entities:
            entity_by_key.setdefault(f"{entity['text']}:{entity['label']}", entity)

        for relation in found_relations:
            relation_by_key.setdefault(
                f"{relation['subject']}:{relation['predicate']}:{relation['object']}",
                relation,
            )

    return {
        "entities": list(entity_by_key.values()),
        "relations": list(relation_by_key.values()),
    }