Coverage for src / tracekit / analyzers / patterns / __init__.py: 75%

45 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2026-01-11 23:04 +0000

1"""Pattern Detection & Analysis module for TraceKit. 

2 

3This module provides comprehensive pattern detection and analysis capabilities 

4for digital signals and binary data, including: 

5 

6- Periodic pattern detection (autocorrelation, FFT, suffix array) 

7- Repeating sequence detection (n-grams, LRS, approximate matching) 

8- Automatic signature discovery (headers, delimiters, magic bytes) 

9- Pattern clustering by similarity (Hamming, edit distance, hierarchical) 

10- Binary regex pattern matching 

11- Multi-pattern search (Aho-Corasick) 

12- Fuzzy/approximate pattern matching 

13- Pattern learning and discovery 

14 

15 - RE-PAT-001: Binary Regex Pattern Matching 

16 - RE-PAT-002: Multi-Pattern Search (Aho-Corasick) 

17 - RE-PAT-003: Fuzzy Pattern Matching 

18 - RE-PAT-004: Pattern Learning and Discovery 

19 

20Author: TraceKit Development Team 

21""" 

22 

23# Periodic pattern detection (PAT-001) 

24# Pattern clustering (PAT-004) 

25from .clustering import ( 

26 ClusteringResult, 

27 ClusterResult, 

28 analyze_cluster, 

29 cluster_by_edit_distance, 

30 cluster_by_hamming, 

31 cluster_hierarchical, 

32 compute_distance_matrix, 

33) 

34 

35# Signature discovery (PAT-003) 

36from .discovery import ( 

37 CandidateSignature, 

38 SignatureDiscovery, 

39 discover_signatures, 

40 find_delimiter_candidates, 

41 find_header_candidates, 

42) 

43 

44# RE-PAT-004: Pattern Learning and Discovery 

45from .learning import ( 

46 LearnedPattern, 

47 NgramModel, 

48 PatternLearner, 

49 StructureHypothesis, 

50 find_recurring_structures, 

51 infer_structure, 

52 learn_patterns_from_data, 

53) 

54 

55# RE-PAT-001, RE-PAT-002, RE-PAT-003: Advanced pattern matching 

56from .matching import ( 

57 AhoCorasickMatcher, 

58 # Classes 

59 BinaryRegex, 

60 FuzzyMatcher, 

61 FuzzyMatchResult, 

62 # Data classes 

63 PatternMatchResult, 

64 # RE-PAT-001: Binary Regex 

65 binary_regex_search, 

66 count_pattern_occurrences, 

67 # Utilities 

68 find_pattern_positions, 

69 find_similar_sequences, 

70 # RE-PAT-003: Fuzzy Matching 

71 fuzzy_search, 

72 # RE-PAT-002: Multi-Pattern Search 

73 multi_pattern_search, 

74) 

75from .periodic import ( 

76 PeriodicPatternDetector, 

77 PeriodResult, 

78 detect_period, 

79 detect_periods_autocorr, 

80 detect_periods_fft, 

81 validate_period, 

82) 

83 

# Backward-compatible aliases: the singular ``detect_period_*`` names are bound
# to the same function objects as the plural ``detect_periods_*`` functions
# imported from ``.periodic``.
detect_period_autocorr = detect_periods_autocorr
detect_period_fft = detect_periods_fft

87 

88# Repeating sequence detection (PAT-002) 

89# Motif detection functions (aliases for test compatibility) 

90from typing import TYPE_CHECKING, Any, cast 

91 

92from .sequences import ( 

93 NgramResult, 

94 RepeatingSequence, 

95 find_approximate_repeats, 

96 find_frequent_ngrams, 

97 find_longest_repeat, 

98 find_repeating_sequences, 

99) 

100 

101if TYPE_CHECKING: 

102 import numpy as np 

103 from numpy.typing import NDArray 

104 

105 

def find_motifs(
    data: Any, motif_length: int = 8, max_distance: float = 0.1
) -> list[RepeatingSequence]:
    """Locate motifs (repeating patterns) in ``data``.

    Thin compatibility wrapper around ``find_repeating_sequences``: the input
    is converted with ``numpy.asarray`` and the search is delegated with
    ``min_count=2``.

    Args:
        data: Input data; anything ``numpy.asarray`` accepts.
        motif_length: Forwarded to ``find_repeating_sequences`` as
            ``min_length``.
        max_distance: Accepted for signature compatibility only; it is not
            used by this function.

    Returns:
        The list of RepeatingSequence objects produced by
        ``find_repeating_sequences``.
    """
    import numpy as np

    return find_repeating_sequences(
        np.asarray(data), min_length=motif_length, min_count=2
    )

126 

127 

def extract_motif(data: Any, start: int, length: int) -> "NDArray[np.generic]":
    """Extract a motif (a window along the first axis) from data.

    The window is ``[start, start + length)``. A negative ``start`` counts from
    the end of the data, as in normal Python indexing, and the window is then
    clipped to the data bounds. A non-positive ``length`` yields an empty
    result.

    Args:
        data: Input data; converted with ``numpy.asarray``.
        start: Start index. Negative values are relative to the end.
        length: Number of elements to extract. Values below zero are treated
            as zero.

    Returns:
        Extracted motif as a numpy array (a slice of the converted input).
    """
    import numpy as np
    from numpy.typing import NDArray

    data_arr = np.asarray(data)
    total = data_arr.shape[0]

    # Resolve a negative start up front. Slicing with the raw values would let
    # ``start + length`` cross zero and turn e.g. data[-2:0] into an empty
    # slice instead of the last two elements.
    begin = start + total if start < 0 else start
    # A negative length must not become a negative (end-relative) stop index.
    end = begin + max(length, 0)

    # Clamp both bounds at zero so a window lying before the data is empty
    # rather than being reinterpreted as end-relative indices.
    result: NDArray[np.generic] = data_arr[max(begin, 0) : max(end, 0)]
    return result

145 

146 

def detect_anomalies(data: Any, threshold: float = 3.0) -> list[int]:
    """Detect anomalies in data using z-scores.

    An element is reported when ``abs((x - mean) / std) > threshold``, with
    mean and standard deviation computed over the whole input.

    Args:
        data: Input data; converted with ``numpy.asarray``.
        threshold: Z-score threshold; only strictly greater scores are
            reported.

    Returns:
        List of anomaly indices (first-axis indices from ``numpy.where``).
        Empty when the input is empty, has zero spread, or has a non-finite
        standard deviation.
    """
    import numpy as np

    data_arr = np.asarray(data)
    if data_arr.size == 0:
        # np.mean/np.std on an empty array emit RuntimeWarnings and give NaN;
        # there is nothing to flag, so answer directly.
        return []

    mean = np.mean(data_arr)
    std = np.std(data_arr)
    # Zero spread would divide by zero. A NaN/inf std (NaN or inf in the data)
    # makes every z-score NaN, which never exceeds the threshold, so the
    # result is empty either way.
    if std == 0 or not np.isfinite(std):
        return []

    z_scores = np.abs((data_arr - mean) / std)
    indices = np.where(z_scores > threshold)[0].tolist()
    return cast("list[int]", indices)

168 

169 

def cluster_patterns(patterns: Any, method: str = "hamming") -> ClusteringResult:
    """Cluster patterns by similarity.

    Args:
        patterns: Patterns to cluster; passed unchanged to the selected
            clustering function.
        method: Clustering method. ``"hamming"`` (matched case-insensitively,
            ignoring surrounding whitespace) selects ``cluster_by_hamming``.
            Any other value, including ``"edit"``, selects
            ``cluster_by_edit_distance``.

    Returns:
        ClusteringResult returned by the selected clustering function.
    """
    # Normalise so "Hamming" or " hamming " is not silently routed to
    # edit-distance clustering. Non-string values are left as-is and take the
    # fallback below, as they always have.
    normalized = method.strip().lower() if isinstance(method, str) else method

    if normalized == "hamming":
        return cluster_by_hamming(patterns)
    # Existing fallback kept on purpose: every non-hamming method means edit
    # distance, so callers passing other spellings keep working.
    return cluster_by_edit_distance(patterns)

184 

185 

def pattern_similarity(pattern1: Any, pattern2: Any) -> float:
    """Calculate similarity between two patterns.

    The score is the fraction of element positions at which the two patterns
    are equal.

    Args:
        pattern1: First pattern. ``bytes``/``bytearray`` are compared byte by
            byte; anything else is converted with ``numpy.asarray``.
        pattern2: Second pattern, converted the same way.

    Returns:
        Similarity score (0-1, 1 = identical). Patterns of different shape
        score 0.0; two empty patterns score 1.0.
    """
    import numpy as np

    def _to_array(pattern: Any) -> Any:
        # np.asarray(b"...") gives a 0-d bytes scalar, so bytes-like input is
        # expanded to one uint8 element per byte instead.
        if isinstance(pattern, (bytes, bytearray)):
            return np.frombuffer(pattern, dtype=np.uint8)
        return np.asarray(pattern)

    p1 = _to_array(pattern1)
    p2 = _to_array(pattern2)

    # Compare full shapes, not only the first dimension, so the element-wise
    # comparison below can never broadcast.
    if p1.shape != p2.shape:
        return 0.0

    if p1.size == 0:
        return 1.0

    matches = int(np.count_nonzero(p1 == p2))
    # Divide by the total element count; dividing by len() would let
    # multi-dimensional input score above 1 and fails for 0-d input.
    return float(matches / p1.size)

209 

210 

# Public API of the patterns package, kept in sorted order (classes first, then
# functions). It lists the names re-exported from the clustering, discovery,
# learning, matching, periodic and sequences submodules, plus the aliases and
# helper functions defined in this module.
__all__ = [
    "AhoCorasickMatcher",
    "BinaryRegex",
    "CandidateSignature",
    "ClusterResult",
    "ClusteringResult",
    "FuzzyMatchResult",
    "FuzzyMatcher",
    "LearnedPattern",
    "NgramModel",
    "NgramResult",
    "PatternLearner",
    "PatternMatchResult",
    "PeriodResult",
    "PeriodicPatternDetector",
    "RepeatingSequence",
    "SignatureDiscovery",
    "StructureHypothesis",
    "analyze_cluster",
    "binary_regex_search",
    "cluster_by_edit_distance",
    "cluster_by_hamming",
    "cluster_hierarchical",
    "cluster_patterns",
    "compute_distance_matrix",
    "count_pattern_occurrences",
    "detect_anomalies",
    "detect_period",
    "detect_period_autocorr",
    "detect_period_fft",
    "detect_periods_autocorr",
    "detect_periods_fft",
    "discover_signatures",
    "extract_motif",
    "find_approximate_repeats",
    "find_delimiter_candidates",
    "find_frequent_ngrams",
    "find_header_candidates",
    "find_longest_repeat",
    "find_motifs",
    "find_pattern_positions",
    "find_recurring_structures",
    "find_repeating_sequences",
    "find_similar_sequences",
    "fuzzy_search",
    "infer_structure",
    "learn_patterns_from_data",
    "multi_pattern_search",
    "pattern_similarity",
    "validate_period",
]