Coverage for src/csv_schema_validator/tests/test_validate_csv.py: 98%

221 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-12-23 15:34 +0100

1import pytest 

2import tempfile 

3import os 

4 

5from csv_schema_validator.validate_csv import validate_csv 

6 

7 

8class TestValidateCSV: 

9 """Test suite for the validate_csv function""" 

10 

@pytest.fixture
def temp_dir(self):
    """Yield the path of a fresh temporary directory.

    The directory is created with ``tempfile.TemporaryDirectory`` and is
    cleaned up when the context manager exits, i.e. after the requesting
    test has finished with it.
    """
    scratch = tempfile.TemporaryDirectory()
    with scratch as scratch_path:
        yield scratch_path

16 

@pytest.fixture
def basic_schema(self):
    """Return the schema dictionary shared by most tests.

    It declares six required fields, in this order: ``id`` (integer),
    ``name`` (string), ``email`` (string with a regex pattern),
    ``department`` (string restricted to an enum), ``salary`` (number with
    min/max bounds) and ``is_active`` (boolean).
    """
    id_field = {
        "name": "id",
        "type": "integer",
        "required": True,
        "description": "Unique identifier",
    }
    name_field = {
        "name": "name",
        "type": "string",
        "required": True,
        "description": "Name field",
    }
    email_field = {
        "name": "email",
        "type": "string",
        "required": True,
        "pattern": "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$",
        "description": "Email address",
    }
    department_field = {
        "name": "department",
        "type": "string",
        "required": True,
        "enum": ["Engineering", "Marketing", "Sales"],
        "description": "Department",
    }
    salary_field = {
        "name": "salary",
        "type": "number",
        "required": True,
        "min": 30000,
        "max": 200000,
        "description": "Salary",
    }
    is_active_field = {
        "name": "is_active",
        "type": "boolean",
        "required": True,
        "description": "Active status",
    }

    schema = {
        "name": "Test Schema",
        "description": "Basic test schema",
        # Field order is kept exactly as declared above.
        "fields": [
            id_field,
            name_field,
            email_field,
            department_field,
            salary_field,
            is_active_field,
        ],
    }
    return schema

66 

@pytest.fixture
def invalid_schema(self):
    """Return a schema dictionary that is deliberately malformed.

    Its only field entry has no ``type`` key, and the top level carries an
    extra ``invalid_field`` key.
    """
    typeless_field = {
        "name": "id",
        "required": True,
    }
    schema = {
        "name": "Invalid Schema",
        "description": "Invalid schema",
        "fields": [typeless_field],
        "invalid_field": "invalid_field",
    }
    return schema

81 

@pytest.fixture
def non_json_non_text_schema(self):
    """Return a schema value that is neither a mapping nor text (an int)."""
    not_a_schema = 123
    return not_a_schema

86 

@pytest.fixture
def non_json_text_schema(self):
    """Return a schema value that is plain text rather than a mapping."""
    not_a_schema = "invalid schema"
    return not_a_schema

91 

@pytest.fixture
def valid_csv(self, temp_dir):
    """Write ``valid.csv`` (header plus two data rows) and return its path.

    The file is created inside the ``temp_dir`` fixture directory and has
    no trailing newline.
    """
    lines = (
        "id,name,email,department,salary,is_active",
        "1,John Doe,john.doe@company.com,Engineering,75000,true",
        "2,Jane Smith,jane.smith@company.com,Marketing,65000,false",
    )
    target = os.path.join(temp_dir, "valid.csv")
    with open(target, "w") as handle:
        handle.write("\n".join(lines))
    return target

103 

@pytest.fixture
def invalid_csv(self, temp_dir):
    """Write ``invalid.csv`` (header plus four data rows) and return its path.

    The rows contain values such as a malformed email, an unknown
    department, a non-numeric id and out-of-range salaries. The third data
    row also carries one more cell than the header has columns.
    """
    lines = (
        "id,name,email,department,salary,is_active",
        "1,John Doe,invalid-email,Engineering,75000,true",
        "2,Jane Smith,jane.smith@company.com,InvalidDept,65000,false",
        "3,invalid-id,Bob Johnson,bob@company.com,Sales,25000,maybe",
        "4,Alice Williams,alice@company.com,Marketing,300000,true",
    )
    target = os.path.join(temp_dir, "invalid.csv")
    with open(target, "w") as handle:
        handle.write("\n".join(lines))
    return target

116 

@pytest.fixture
def non_matching_csv(self, temp_dir):
    """Write ``non_matching.csv`` and return its path.

    The header spells the second column ``namee`` instead of ``name``, and
    the two data rows contain several values that do not fit the basic
    schema (truncated email, misspelled department, ``yes`` as boolean,
    ``two`` as id).
    """
    lines = (
        "id,namee,email,department,salary,is_active",
        "1,John Doe,john.doe@company,Engineerin,75000,yes",
        "two,Jane Smith,jane.smith@company.com,Marketing,65000,false",
    )
    target = os.path.join(temp_dir, "non_matching.csv")
    with open(target, "w") as handle:
        handle.write("\n".join(lines))
    return target

127 

def test_validate_csv_empty_file(self, temp_dir, basic_schema):
    """A zero-byte CSV file must yield exactly one ``EmptyFileError``."""
    empty_csv = os.path.join(temp_dir, "empty.csv")
    # Opening for writing and closing immediately leaves a zero-byte file.
    with open(empty_csv, "w"):
        pass

    expected_error = {
        "error_type": "EmptyFileError",
        "error_message": "Csv file is empty",
        "row": None,
        "column": None,
        "value": None,
        "details": {"file_type": "CSV", "file_path": empty_csv},
    }

    outcome = validate_csv(empty_csv, basic_schema)
    assert outcome == {"is_valid": False, "errors": [expected_error]}

148 

def test_validate_invalid_schema(self, valid_csv, invalid_schema):
    """A schema whose field lacks ``type`` yields one ``SchemaValidationError``."""
    expected_error = {
        "error_type": "SchemaValidationError",
        "error_message": "Field required",
        "row": None,
        "column": None,
        "value": None,
        "details": {
            "input": {"name": "id", "required": True},
            "type": "missing",
        },
    }
    expected = {
        "is_valid": False,
        "errors": [expected_error],
        "validated_schema": None,
    }

    outcome = validate_csv(valid_csv, invalid_schema)
    assert outcome == expected

168 

def test_non_json_text_schema(self, valid_csv, non_json_text_schema):
    """Passing a plain string as the schema yields a ``SchemaValidationError``."""
    expected_error = {
        "error_type": "SchemaValidationError",
        "error_message": "csv_schema_validator.core.models.CSVSchema() argument after ** must be a mapping, not str",
        "row": None,
        "column": None,
        "value": None,
        "details": {"schema_dict": "invalid schema"},
    }
    expected = {
        "is_valid": False,
        "errors": [expected_error],
        "validated_schema": None,
    }

    outcome = validate_csv(valid_csv, non_json_text_schema)
    assert outcome == expected

185 

def test_non_json_non_text_schema(self, valid_csv, non_json_non_text_schema):
    """Passing an integer as the schema yields a ``SchemaValidationError``."""
    expected_error = {
        "error_type": "SchemaValidationError",
        "error_message": "csv_schema_validator.core.models.CSVSchema() argument after ** must be a mapping, not int",
        "row": None,
        "column": None,
        "value": None,
        "details": {"schema_dict": 123},
    }
    expected = {
        "is_valid": False,
        "errors": [expected_error],
        "validated_schema": None,
    }

    outcome = validate_csv(valid_csv, non_json_non_text_schema)
    assert outcome == expected

202 

def test_validate_valid_csv(self, valid_csv, basic_schema):
    """The valid CSV fixture validates against the basic schema with no errors."""
    expected = {"is_valid": True, "errors": []}
    outcome = validate_csv(valid_csv, basic_schema)
    assert outcome == expected

209 

def test_validate_non_matching_csv(self, non_matching_csv, basic_schema):
    """The non-matching CSV fixture yields the exact, ordered error list below.

    Expected, in order: one header-level ``RequiredFieldError`` for the
    missing ``name`` column, then three cell errors on row 2 (email
    pattern, department enum, boolean type) and one on row 3 (integer
    type of ``id``).
    """

    def make_error(error_type, message, *, row, column, value, details):
        # Assemble one error record with the six keys the result uses.
        return {
            "error_type": error_type,
            "error_message": message,
            "row": row,
            "column": column,
            "value": value,
            "details": details,
        }

    schema_columns = ["id", "name", "email", "department", "salary", "is_active"]
    csv_columns = ["id", "namee", "email", "department", "salary", "is_active"]

    expected_errors = [
        make_error(
            "RequiredFieldError",
            "Missing required fields: name",
            row=-1,
            column="",
            value=None,
            details={
                "required_fields": schema_columns,
                "missing_fields": ["name"],
                "available_fields": csv_columns,
            },
        ),
        make_error(
            "PatternValidationError",
            "Field 'email' does not match required pattern",
            row=2,
            column="email",
            value="john.doe@company",
            details={
                "expected_pattern": "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$"
            },
        ),
        make_error(
            "EnumValidationError",
            "Field 'department' value 'Engineerin' is not in allowed values",
            row=2,
            column="department",
            value="Engineerin",
            details={"allowed_values": ["Engineering", "Marketing", "Sales"]},
        ),
        make_error(
            "TypeValidationError",
            "Invalid type for field 'is_active': expected boolean, got str",
            row=2,
            column="is_active",
            value="yes",
            details={"supported_values": ["true", "false"]},
        ),
        make_error(
            "TypeValidationError",
            "Invalid type for field 'id': expected integer, got str",
            row=3,
            column="id",
            value="two",
            details={"expected_type": "integer", "actual_type": "str"},
        ),
    ]

    outcome = validate_csv(non_matching_csv, basic_schema)
    assert outcome == {"is_valid": False, "errors": expected_errors}

277 

def test_validate_csv_edge_case_data_types(self, temp_dir, basic_schema):
    """Rows with empty or blank cells in each column must fail validation.

    Each of the first six data rows leaves a different column empty (or
    blank); the last two rows sit on the salary min and max bounds.
    """
    lines = (
        "id,name,email,department,salary,is_active",
        ",John Doe,john.doe@company.com,Engineering,75000,true",
        "1, ,jane.smith@company.com,Marketing,65000,false",
        "2,Jane Smith,,Sales,65000,true",
        "3,Bob Johnson,bob@company.com,,65000,false",
        "4,Alice Williams,alice@company.com,Engineering,,true",
        "5,Charlie Brown,charlie@company.com,Marketing,65000,",
        "6,David Wilson,david@company.com,Engineering,30000,true",
        "7,Eve Davis,eve@company.com,Marketing,200000,false",
    )
    csv_file = os.path.join(temp_dir, "edge_cases.csv")
    with open(csv_file, "w") as handle:
        handle.write("\n".join(lines))

    outcome = validate_csv(csv_file, basic_schema)

    # Expect errors for the empty id, empty name, missing email,
    # missing department, and so on.
    assert not outcome["is_valid"]
    assert len(outcome["errors"]) > 0

298 

def test_validate_csv_min_max_boundaries(self, temp_dir, basic_schema):
    """Salaries just outside the schema's min/max must be reported.

    The file holds salaries of 30000 and 200000 (the bounds themselves)
    plus 29999 and 200001 (one step outside each bound).
    """
    lines = (
        "id,name,email,department,salary,is_active",
        "1,John Doe,john.doe@company.com,Engineering,30000,true",
        "2,Jane Smith,jane.smith@company.com,Marketing,200000,false",
        "3,Bob Johnson,bob@company.com,Sales,29999,true",
        "4,Alice Williams,alice@company.com,Engineering,200001,false",
    )
    csv_file = os.path.join(temp_dir, "boundaries.csv")
    with open(csv_file, "w") as handle:
        handle.write("\n".join(lines))

    outcome = validate_csv(csv_file, basic_schema)
    assert not outcome["is_valid"]

    # Collect every error attributed to the salary column.
    salary_errors = []
    for error in outcome["errors"]:
        if error.get("column") == "salary":
            salary_errors.append(error)

    # At least 29999 and 200001 should fail.
    assert len(salary_errors) >= 2

316 

def test_validate_csv_large_file(self, temp_dir, basic_schema):
    """A CSV with 1000 valid data rows must validate without errors.

    The file content is a header line followed by 1000 generated rows,
    each terminated by a newline.
    """
    header = "id,name,email,department,salary,is_active\n"
    # Build all rows in one join instead of repeated string concatenation,
    # which re-copies the growing string on every iteration.
    data_rows = "".join(
        f"{number},User{number},user{number}@company.com,Engineering,75000,true\n"
        for number in range(1, 1001)
    )
    csv_content = header + data_rows

    csv_file = os.path.join(temp_dir, "large.csv")
    with open(csv_file, "w") as f:
        f.write(csv_content)

    result = validate_csv(csv_file, basic_schema)
    assert result["is_valid"]
    assert len(result["errors"]) == 0

331 

def test_validate_csv_with_quotes_and_commas(self, temp_dir, basic_schema):
    """Quoted cells containing commas are checked against the enum.

    The second data row's department is the quoted value
    ``Marketing, Digital``, which is not one of the schema's enum values,
    so at least one ``EnumValidationError`` is expected.
    """
    lines = (
        'id,name,email,department,salary,is_active',
        '1,"Doe, John",john.doe@company.com,Engineering,75000,true',
        '2,"Smith, Jane",jane.smith@company.com,"Marketing, Digital",65000,false',
    )
    csv_file = os.path.join(temp_dir, "quoted.csv")
    with open(csv_file, "w") as handle:
        handle.write("\n".join(lines))

    outcome = validate_csv(csv_file, basic_schema)

    # Should fail because "Marketing, Digital" is not in the enum.
    assert not outcome["is_valid"]
    enum_errors = []
    for error in outcome["errors"]:
        if error.get("error_type") == "EnumValidationError":
            enum_errors.append(error)
    assert len(enum_errors) > 0

347 

def test_validate_csv_unicode_characters(self, temp_dir, basic_schema):
    """Names with accented Latin and CJK characters must validate cleanly.

    The file is written as UTF-8.
    """
    lines = (
        "id,name,email,department,salary,is_active",
        "1,José García,jose.garcia@company.com,Engineering,75000,true",
        "2,François Müller,francois.muller@company.com,Marketing,65000,false",
        "3,李小明,li.xiaoming@company.com,Sales,70000,true",
    )
    csv_file = os.path.join(temp_dir, "unicode.csv")
    with open(csv_file, "w", encoding='utf-8') as handle:
        handle.write("\n".join(lines))

    outcome = validate_csv(csv_file, basic_schema)
    assert outcome["is_valid"]
    assert len(outcome["errors"]) == 0

362 

def test_validate_schema_duplicate_field_names(self, temp_dir):
    """A schema declaring the field name ``id`` twice must be rejected.

    The rendered error list is expected to mention
    ``Field names must be unique``.
    """
    first_id = {"name": "id", "type": "integer", "required": True}
    second_id = {"name": "id", "type": "string", "required": True}  # Duplicate name
    schema = {
        "name": "Test Schema",
        "fields": [first_id, second_id],
    }

    csv_file = os.path.join(temp_dir, "test.csv")
    with open(csv_file, "w") as handle:
        handle.write("\n".join(("id,name", "1,John")))

    outcome = validate_csv(csv_file, schema)
    assert not outcome["is_valid"]
    rendered_errors = str(outcome["errors"])
    assert "Field names must be unique" in rendered_errors

382 

def test_validate_schema_invalid_regex_pattern(self, temp_dir):
    """A field whose ``pattern`` is not a valid regex must fail validation."""
    email_field = {
        "name": "email",
        "type": "string",
        "pattern": "[invalid regex",
        "required": True,
    }
    schema = {
        "name": "Test Schema",
        "fields": [email_field],
    }

    csv_file = os.path.join(temp_dir, "test.csv")
    with open(csv_file, "w") as handle:
        handle.write("\n".join(("email", "test@example.com")))

    outcome = validate_csv(csv_file, schema)
    # Should fail schema validation due to the invalid regex.
    assert not outcome["is_valid"]

401 

def test_validate_schema_min_max_on_string_boolean(self, temp_dir):
    """``min``/``max`` constraints on string or boolean fields must fail validation."""
    name_field = {"name": "name", "type": "string", "min": 5, "required": True}
    active_field = {"name": "active", "type": "boolean", "max": 10, "required": True}
    schema = {
        "name": "Test Schema",
        "fields": [name_field, active_field],
    }

    csv_file = os.path.join(temp_dir, "test.csv")
    with open(csv_file, "w") as handle:
        handle.write("\n".join(("name,active", "John,true")))

    outcome = validate_csv(csv_file, schema)
    # Should fail schema validation due to min/max on string/boolean.
    assert not outcome["is_valid"]

421 

def test_validate_csv_nonexistent_file(self, basic_schema):
    """A path that does not exist must yield exactly one ``CSVFileError``."""
    missing_path = "nonexistent_file.csv"
    expected_error = {
        "error_type": "CSVFileError",
        "error_message": "CSV file not found: nonexistent_file.csv",
        "row": None,
        "column": None,
        "value": None,
        "details": {"file_path": "nonexistent_file.csv"},
    }

    outcome = validate_csv(missing_path, basic_schema)
    assert outcome == {"is_valid": False, "errors": [expected_error]}

438 

def test_validate_csv_whitespace_only_fields(self, temp_dir, basic_schema):
    """Cells holding only a single space must make validation fail.

    Each data row blanks a different column: name, email, department.
    """
    lines = (
        "id,name,email,department,salary,is_active",
        "1, ,john.doe@company.com,Engineering,75000,true",
        "2,John Doe, ,Marketing,65000,false",
        "3,John Doe,john.doe@company.com, ,75000,true",
    )
    csv_file = os.path.join(temp_dir, "whitespace.csv")
    with open(csv_file, "w") as handle:
        handle.write("\n".join(lines))

    outcome = validate_csv(csv_file, basic_schema)
    # Should have errors for the whitespace-only fields.
    assert not outcome["is_valid"]

453 

def test_validate_csv_number_formats(self, temp_dir, basic_schema):
    """A file mixing several salary notations, including ``inf``, must fail.

    Salaries used: ``75000.0``, ``65000.50``, ``1e5`` and ``inf``.
    """
    lines = (
        "id,name,email,department,salary,is_active",
        "1,John Doe,john.doe@company.com,Engineering,75000.0,true",
        "2,Jane Smith,jane.smith@company.com,Marketing,65000.50,false",
        "3,Bob Johnson,bob@company.com,Sales,1e5,true",
        "4,Alice Williams,alice@company.com,Engineering,inf,false",
    )
    csv_file = os.path.join(temp_dir, "number_formats.csv")
    with open(csv_file, "w") as handle:
        handle.write("\n".join(lines))

    outcome = validate_csv(csv_file, basic_schema)
    # Should have errors for invalid number formats like 'inf'.
    assert not outcome["is_valid"]

469 

def test_validate_csv_boolean_case_variations(self, temp_dir, basic_schema):
    """``TRUE``/``FALSE``/``True``/``False`` must all be accepted as booleans."""
    lines = (
        "id,name,email,department,salary,is_active",
        "1,John Doe,john.doe@company.com,Engineering,75000,TRUE",
        "2,Jane Smith,jane.smith@company.com,Marketing,65000,FALSE",
        "3,Bob Johnson,bob@company.com,Sales,70000,True",
        "4,Alice Williams,alice@company.com,Engineering,80000,False",
    )
    csv_file = os.path.join(temp_dir, "boolean_cases.csv")
    with open(csv_file, "w") as handle:
        handle.write("\n".join(lines))

    outcome = validate_csv(csv_file, basic_schema)
    # Boolean validation is expected to be case-insensitive, so every
    # spelling above should be valid.
    assert outcome["is_valid"]

485 

def test_validate_csv_no_required_fields(self, temp_dir):
    """A schema with only optional fields validates a fully populated row."""
    optional_id = {"name": "id", "type": "integer", "required": False}
    optional_name = {"name": "name", "type": "string", "required": False}
    schema = {
        "name": "Test Schema",
        "fields": [optional_id, optional_name],
    }

    csv_file = os.path.join(temp_dir, "no_required.csv")
    with open(csv_file, "w") as handle:
        handle.write("\n".join(("id,name", "1,John")))

    outcome = validate_csv(csv_file, schema)
    assert outcome["is_valid"]
    assert len(outcome["errors"]) == 0

505 

def test_validate_csv_extra_columns(self, temp_dir, basic_schema):
    """A CSV column that the schema does not declare must not break validation."""
    lines = (
        "id,name,email,department,salary,is_active,extra_column",
        "1,John Doe,john.doe@company.com,Engineering,75000,true,extra_value",
        "2,Jane Smith,jane.smith@company.com,Marketing,65000,false,another_extra",
    )
    csv_file = os.path.join(temp_dir, "extra_columns.csv")
    with open(csv_file, "w") as handle:
        handle.write("\n".join(lines))

    outcome = validate_csv(csv_file, basic_schema)
    # Extra columns should be ignored, not cause validation errors.
    assert outcome["is_valid"]

519 

def test_validate_csv_error_message_clarity(self, temp_dir, basic_schema):
    """A row that is wrong in every column reports each specific error type.

    The single data row has a non-integer id, a malformed email, an unknown
    department, a salary below the schema minimum and a non-boolean
    ``is_active``. The test checks the *types* of the reported errors, not
    the wording of their messages.
    """
    csv_content = """id,name,email,department,salary,is_active
invalid-id,John Doe,invalid-email,InvalidDept,25000,maybe"""

    csv_file = os.path.join(temp_dir, "error_clarity.csv")
    with open(csv_file, "w") as f:
        f.write(csv_content)

    result = validate_csv(csv_file, basic_schema)
    assert not result["is_valid"]

    # A set gives direct membership checks; each expected type is listed
    # once (both the id and is_active problems are TypeValidationError).
    reported_types = {error.get("error_type") for error in result["errors"]}
    expected_error_types = (
        "TypeValidationError",
        "PatternValidationError",
        "EnumValidationError",
        "RangeValidationError",
    )
    for expected_type in expected_error_types:
        assert expected_type in reported_types, f"Missing expected error type: {expected_type}"

540 

def test_validate_csv_empty_fields_with_required_constraints(self, temp_dir):
    """Empty cells in required columns must make validation fail.

    ``id`` and ``name`` are required, ``email`` is optional. The first data
    row leaves ``id`` (and ``email``) empty; the second leaves ``name``
    empty.
    """
    required_id = {"name": "id", "type": "integer", "required": True}
    required_name = {"name": "name", "type": "string", "required": True}
    optional_email = {"name": "email", "type": "string", "required": False}  # Not required
    schema = {
        "name": "Test Schema",
        "fields": [required_id, required_name, optional_email],
    }

    lines = (
        "id,name,email",
        ",John Doe,",
        "1,,jane@example.com",
    )
    csv_file = os.path.join(temp_dir, "empty_required.csv")
    with open(csv_file, "w") as handle:
        handle.write("\n".join(lines))

    outcome = validate_csv(csv_file, schema)
    # Should have errors for the empty required fields.
    assert not outcome["is_valid"]

563 

def test_validate_csv_very_long_strings(self, temp_dir, basic_schema):
    """A 1000-character name must still validate."""
    long_name = "A" * 1000  # Very long name
    header = "id,name,email,department,salary,is_active"
    data_row = f"1,{long_name},john.doe@company.com,Engineering,75000,true"

    csv_file = os.path.join(temp_dir, "long_strings.csv")
    with open(csv_file, "w") as handle:
        handle.write(header + "\n" + data_row)

    outcome = validate_csv(csv_file, basic_schema)
    # Long strings should be handled gracefully.
    assert outcome["is_valid"]

577 

def test_validate_csv_scientific_notation(self, temp_dir, basic_schema):
    """Salaries written as ``7.5e4`` and ``6.5e4`` must be accepted."""
    lines = (
        "id,name,email,department,salary,is_active",
        "1,John Doe,john.doe@company.com,Engineering,7.5e4,true",
        "2,Jane Smith,jane.smith@company.com,Marketing,6.5e4,false",
    )
    csv_file = os.path.join(temp_dir, "scientific.csv")
    with open(csv_file, "w") as handle:
        handle.write("\n".join(lines))

    outcome = validate_csv(csv_file, basic_schema)
    # Scientific notation should be handled correctly.
    assert outcome["is_valid"]

591 

def test_validate_csv_negative_numbers(self, temp_dir, basic_schema):
    """A salary of ``-75000`` must make validation fail."""
    lines = (
        "id,name,email,department,salary,is_active",
        "1,John Doe,john.doe@company.com,Engineering,-75000,true",
    )
    csv_file = os.path.join(temp_dir, "negative.csv")
    with open(csv_file, "w") as handle:
        handle.write("\n".join(lines))

    outcome = validate_csv(csv_file, basic_schema)
    # Negative salary should fail the min constraint.
    assert not outcome["is_valid"]

604 

def test_validate_csv_zero_values(self, temp_dir, basic_schema):
    """A row with id ``0`` and salary ``0`` must make validation fail."""
    lines = (
        "id,name,email,department,salary,is_active",
        "0,John Doe,john.doe@company.com,Engineering,0,true",
    )
    csv_file = os.path.join(temp_dir, "zero.csv")
    with open(csv_file, "w") as handle:
        handle.write("\n".join(lines))

    outcome = validate_csv(csv_file, basic_schema)
    # Zero salary should fail the min constraint.
    assert not outcome["is_valid"]