Coverage for src/csv_schema_validator/tests/test_validate_csv.py: 98%
221 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-12-23 15:34 +0100
« prev ^ index » next coverage.py v7.10.6, created at 2025-12-23 15:34 +0100
1import pytest
2import tempfile
3import os
5from csv_schema_validator.validate_csv import validate_csv
8class TestValidateCSV:
9 """Test suite for the validate_csv function"""
@pytest.fixture
def temp_dir(self):
    """Yield the path of a fresh temporary directory for test files.

    The directory is created through ``tempfile.TemporaryDirectory`` and is
    cleaned up when the context manager exits after the test has finished.
    """
    scratch = tempfile.TemporaryDirectory()
    with scratch as scratch_path:
        yield scratch_path
@pytest.fixture
def basic_schema(self):
    """Return the six-field schema dict shared by most tests.

    The fields are, in order: ``id`` (integer), ``name`` (string), ``email``
    (string with a regex pattern), ``department`` (string with an enum),
    ``salary`` (number with min/max) and ``is_active`` (boolean). Every
    field is marked as required.
    """
    id_field = {
        "name": "id",
        "type": "integer",
        "required": True,
        "description": "Unique identifier",
    }
    name_field = {
        "name": "name",
        "type": "string",
        "required": True,
        "description": "Name field",
    }
    email_field = {
        "name": "email",
        "type": "string",
        "required": True,
        "pattern": "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$",
        "description": "Email address",
    }
    department_field = {
        "name": "department",
        "type": "string",
        "required": True,
        "enum": ["Engineering", "Marketing", "Sales"],
        "description": "Department",
    }
    salary_field = {
        "name": "salary",
        "type": "number",
        "required": True,
        "min": 30000,
        "max": 200000,
        "description": "Salary",
    }
    active_field = {
        "name": "is_active",
        "type": "boolean",
        "required": True,
        "description": "Active status",
    }

    schema = {
        "name": "Test Schema",
        "description": "Basic test schema",
        "fields": [
            id_field,
            name_field,
            email_field,
            department_field,
            salary_field,
            active_field,
        ],
    }
    return schema
@pytest.fixture
def invalid_schema(self):
    """Return a malformed schema dict for negative tests.

    Its only field entry has no ``type`` key, and the top level carries an
    extra ``invalid_field`` key.
    """
    typeless_field = {
        "name": "id",
        "required": True,
    }
    schema = {
        "name": "Invalid Schema",
        "description": "Invalid schema",
        "fields": [typeless_field],
        "invalid_field": "invalid_field",
    }
    return schema
@pytest.fixture
def non_json_non_text_schema(self):
    """Return an integer to be passed where a schema mapping is expected."""
    not_a_schema = 123
    return not_a_schema
@pytest.fixture
def non_json_text_schema(self):
    """Return a plain string to be passed where a schema mapping is expected."""
    not_a_schema = "invalid schema"
    return not_a_schema
@pytest.fixture
def valid_csv(self, temp_dir):
    """Write ``valid.csv`` (header plus two data rows) into ``temp_dir``.

    Returns the path of the written file. The content has no trailing
    newline.
    """
    lines = (
        "id,name,email,department,salary,is_active",
        "1,John Doe,john.doe@company.com,Engineering,75000,true",
        "2,Jane Smith,jane.smith@company.com,Marketing,65000,false",
    )
    target_path = os.path.join(temp_dir, "valid.csv")
    with open(target_path, "w") as handle:
        handle.write("\n".join(lines))
    return target_path
@pytest.fixture
def invalid_csv(self, temp_dir):
    """Write ``invalid.csv`` (header plus four data rows) into ``temp_dir``.

    Returns the path of the written file. Note that the third data row holds
    seven comma-separated values under a six-column header.
    """
    lines = (
        "id,name,email,department,salary,is_active",
        "1,John Doe,invalid-email,Engineering,75000,true",
        "2,Jane Smith,jane.smith@company.com,InvalidDept,65000,false",
        "3,invalid-id,Bob Johnson,bob@company.com,Sales,25000,maybe",
        "4,Alice Williams,alice@company.com,Marketing,300000,true",
    )
    target_path = os.path.join(temp_dir, "invalid.csv")
    with open(target_path, "w") as handle:
        handle.write("\n".join(lines))
    return target_path
@pytest.fixture
def non_matching_csv(self, temp_dir):
    """Write ``non_matching.csv`` into ``temp_dir`` and return its path.

    The header spells the second column ``namee`` instead of ``name``; the
    two data rows contain a truncated email, a misspelled department, the
    boolean ``yes`` and the non-numeric id ``two``.
    """
    lines = (
        "id,namee,email,department,salary,is_active",
        "1,John Doe,john.doe@company,Engineerin,75000,yes",
        "two,Jane Smith,jane.smith@company.com,Marketing,65000,false",
    )
    target_path = os.path.join(temp_dir, "non_matching.csv")
    with open(target_path, "w") as handle:
        handle.write("\n".join(lines))
    return target_path
def test_validate_csv_empty_file(self, temp_dir, basic_schema):
    """A zero-byte CSV file must produce exactly one ``EmptyFileError``."""
    empty_csv = os.path.join(temp_dir, "empty.csv")
    with open(empty_csv, "w") as handle:
        handle.write("")

    empty_file_error = {
        "error_type": "EmptyFileError",
        "error_message": "Csv file is empty",
        "row": None,
        "column": None,
        "value": None,
        "details": {"file_type": "CSV", "file_path": empty_csv},
    }
    expected = {"is_valid": False, "errors": [empty_file_error]}

    outcome = validate_csv(empty_csv, basic_schema)
    assert outcome == expected
def test_validate_invalid_schema(self, valid_csv, invalid_schema):
    """The malformed schema fixture must yield one ``SchemaValidationError``.

    The expected error reports "Field required" (type ``missing``) for the
    field entry ``{"name": "id", "required": True}``, and the result carries
    ``validated_schema`` set to ``None``.
    """
    field_required_error = {
        "error_type": "SchemaValidationError",
        "error_message": "Field required",
        "row": None,
        "column": None,
        "value": None,
        "details": {
            "input": {"name": "id", "required": True},
            "type": "missing",
        },
    }
    expected = {
        "is_valid": False,
        "errors": [field_required_error],
        "validated_schema": None,
    }

    outcome = validate_csv(valid_csv, invalid_schema)
    assert outcome == expected
def test_non_json_text_schema(self, valid_csv, non_json_text_schema):
    """Passing a string as the schema must yield one ``SchemaValidationError``.

    The expected message is the "argument after ** must be a mapping, not str"
    text, and the offending value is echoed under ``details["schema_dict"]``.
    """
    not_a_mapping_error = {
        "error_type": "SchemaValidationError",
        "error_message": "csv_schema_validator.core.models.CSVSchema() argument after ** must be a mapping, not str",
        "row": None,
        "column": None,
        "value": None,
        "details": {"schema_dict": "invalid schema"},
    }
    expected = {
        "is_valid": False,
        "errors": [not_a_mapping_error],
        "validated_schema": None,
    }

    outcome = validate_csv(valid_csv, non_json_text_schema)
    assert outcome == expected
def test_non_json_non_text_schema(self, valid_csv, non_json_non_text_schema):
    """Passing an int as the schema must yield one ``SchemaValidationError``.

    The expected message is the "argument after ** must be a mapping, not int"
    text, and the offending value is echoed under ``details["schema_dict"]``.
    """
    not_a_mapping_error = {
        "error_type": "SchemaValidationError",
        "error_message": "csv_schema_validator.core.models.CSVSchema() argument after ** must be a mapping, not int",
        "row": None,
        "column": None,
        "value": None,
        "details": {"schema_dict": 123},
    }
    expected = {
        "is_valid": False,
        "errors": [not_a_mapping_error],
        "validated_schema": None,
    }

    outcome = validate_csv(valid_csv, non_json_non_text_schema)
    assert outcome == expected
def test_validate_valid_csv(self, valid_csv, basic_schema):
    """The valid CSV fixture must validate against ``basic_schema`` with no errors."""
    expected = {"is_valid": True, "errors": []}
    outcome = validate_csv(valid_csv, basic_schema)
    assert outcome == expected
def test_validate_non_matching_csv(self, non_matching_csv, basic_schema):
    """Pin the exact, ordered error list for the ``non_matching_csv`` fixture.

    Expected order: the missing required ``name`` header, then three errors
    reported with ``row`` 2 (email pattern, department enum, boolean type),
    then one reported with ``row`` 3 (integer type of ``id``).
    """
    schema_columns = ["id", "name", "email", "department", "salary", "is_active"]
    header_columns = ["id", "namee", "email", "department", "salary", "is_active"]

    missing_name_header = {
        "error_type": "RequiredFieldError",
        "error_message": "Missing required fields: name",
        "row": -1,
        "column": "",
        "value": None,
        "details": {
            "required_fields": schema_columns,
            "missing_fields": ["name"],
            "available_fields": header_columns,
        },
    }
    bad_email = {
        "error_type": "PatternValidationError",
        "error_message": "Field 'email' does not match required pattern",
        "value": "john.doe@company",
        "column": "email",
        "row": 2,
        "details": {
            "expected_pattern": "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$"
        },
    }
    bad_department = {
        "error_type": "EnumValidationError",
        "error_message": "Field 'department' value 'Engineerin' is not in allowed values",
        "value": "Engineerin",
        "column": "department",
        "row": 2,
        "details": {"allowed_values": ["Engineering", "Marketing", "Sales"]},
    }
    bad_boolean = {
        "error_type": "TypeValidationError",
        "error_message": "Invalid type for field 'is_active': expected boolean, got str",
        "value": "yes",
        "column": "is_active",
        "row": 2,
        "details": {"supported_values": ["true", "false"]},
    }
    bad_id = {
        "error_type": "TypeValidationError",
        "error_message": "Invalid type for field 'id': expected integer, got str",
        "value": "two",
        "column": "id",
        "row": 3,
        "details": {"expected_type": "integer", "actual_type": "str"},
    }
    expected = {
        "is_valid": False,
        "errors": [
            missing_name_header,
            bad_email,
            bad_department,
            bad_boolean,
            bad_id,
        ],
    }

    outcome = validate_csv(non_matching_csv, basic_schema)
    assert outcome == expected
def test_validate_csv_edge_case_data_types(self, temp_dir, basic_schema):
    """Rows with one empty or blank cell each must make validation fail.

    The data rows leave, in turn, the id, name (a single space), email,
    department, salary and is_active cell empty; the last two rows use the
    salary bounds 30000 and 200000. Only the overall failure and the
    presence of at least one error are asserted.
    """
    lines = (
        "id,name,email,department,salary,is_active",
        ",John Doe,john.doe@company.com,Engineering,75000,true",
        "1, ,jane.smith@company.com,Marketing,65000,false",
        "2,Jane Smith,,Sales,65000,true",
        "3,Bob Johnson,bob@company.com,,65000,false",
        "4,Alice Williams,alice@company.com,Engineering,,true",
        "5,Charlie Brown,charlie@company.com,Marketing,65000,",
        "6,David Wilson,david@company.com,Engineering,30000,true",
        "7,Eve Davis,eve@company.com,Marketing,200000,false",
    )
    csv_path = os.path.join(temp_dir, "edge_cases.csv")
    with open(csv_path, "w") as handle:
        handle.write("\n".join(lines))

    outcome = validate_csv(csv_path, basic_schema)
    assert not outcome["is_valid"]
    error_count = len(outcome["errors"])
    assert error_count > 0
def test_validate_csv_min_max_boundaries(self, temp_dir, basic_schema):
    """Salaries just outside the 30000..200000 range must be reported.

    The file holds salaries 30000, 200000, 29999 and 200001; the test
    requires at least two errors whose ``column`` is ``salary``.
    """
    lines = (
        "id,name,email,department,salary,is_active",
        "1,John Doe,john.doe@company.com,Engineering,30000,true",
        "2,Jane Smith,jane.smith@company.com,Marketing,200000,false",
        "3,Bob Johnson,bob@company.com,Sales,29999,true",
        "4,Alice Williams,alice@company.com,Engineering,200001,false",
    )
    csv_path = os.path.join(temp_dir, "boundaries.csv")
    with open(csv_path, "w") as handle:
        handle.write("\n".join(lines))

    outcome = validate_csv(csv_path, basic_schema)
    assert not outcome["is_valid"]

    # Collect only the errors attributed to the salary column.
    salary_errors = []
    for entry in outcome["errors"]:
        if entry.get("column") == "salary":
            salary_errors.append(entry)
    assert len(salary_errors) >= 2  # At least 29999 and 200001 should fail
def test_validate_csv_large_file(self, temp_dir, basic_schema):
    """A generated file with 1000 well-formed data rows must validate cleanly."""
    header = "id,name,email,department,salary,is_active\n"
    data_lines = [
        f"{n},User{n},user{n}@company.com,Engineering,75000,true\n"
        for n in range(1, 1001)
    ]
    csv_content = header + "".join(data_lines)

    csv_path = os.path.join(temp_dir, "large.csv")
    with open(csv_path, "w") as handle:
        handle.write(csv_content)

    outcome = validate_csv(csv_path, basic_schema)
    assert outcome["is_valid"]
    error_count = len(outcome["errors"])
    assert error_count == 0
def test_validate_csv_with_quotes_and_commas(self, temp_dir, basic_schema):
    """Quoted cells containing commas must be read as single values.

    The second row's department is the quoted value "Marketing, Digital",
    which is not in the schema enum, so at least one ``EnumValidationError``
    is expected.
    """
    lines = (
        'id,name,email,department,salary,is_active',
        '1,"Doe, John",john.doe@company.com,Engineering,75000,true',
        '2,"Smith, Jane",jane.smith@company.com,"Marketing, Digital",65000,false',
    )
    csv_path = os.path.join(temp_dir, "quoted.csv")
    with open(csv_path, "w") as handle:
        handle.write("\n".join(lines))

    outcome = validate_csv(csv_path, basic_schema)
    assert not outcome["is_valid"]

    enum_errors = []
    for entry in outcome["errors"]:
        if entry.get("error_type") == "EnumValidationError":
            enum_errors.append(entry)
    assert len(enum_errors) > 0
def test_validate_csv_unicode_characters(self, temp_dir, basic_schema):
    """Names with accented and CJK characters must validate without errors.

    The file is written explicitly as UTF-8.
    """
    lines = (
        "id,name,email,department,salary,is_active",
        "1,José García,jose.garcia@company.com,Engineering,75000,true",
        "2,François Müller,francois.muller@company.com,Marketing,65000,false",
        "3,李小明,li.xiaoming@company.com,Sales,70000,true",
    )
    csv_path = os.path.join(temp_dir, "unicode.csv")
    with open(csv_path, "w", encoding='utf-8') as handle:
        handle.write("\n".join(lines))

    outcome = validate_csv(csv_path, basic_schema)
    assert outcome["is_valid"]
    error_count = len(outcome["errors"])
    assert error_count == 0
def test_validate_schema_duplicate_field_names(self, temp_dir):
    """A schema declaring the field name ``id`` twice must be rejected.

    The stringified error list is expected to mention
    "Field names must be unique".
    """
    first_id = {"name": "id", "type": "integer", "required": True}
    second_id = {"name": "id", "type": "string", "required": True}  # Duplicate name
    schema = {
        "name": "Test Schema",
        "fields": [first_id, second_id],
    }

    csv_path = os.path.join(temp_dir, "test.csv")
    with open(csv_path, "w") as handle:
        handle.write("\n".join(("id,name", "1,John")))

    outcome = validate_csv(csv_path, schema)
    assert not outcome["is_valid"]
    rendered_errors = str(outcome["errors"])
    assert "Field names must be unique" in rendered_errors
def test_validate_schema_invalid_regex_pattern(self, temp_dir):
    """A schema whose ``pattern`` is the unterminated regex ``[invalid regex`` must fail.

    Only the overall ``is_valid`` flag is asserted.
    """
    email_field = {
        "name": "email",
        "type": "string",
        "pattern": "[invalid regex",
        "required": True,
    }
    schema = {"name": "Test Schema", "fields": [email_field]}

    csv_path = os.path.join(temp_dir, "test.csv")
    with open(csv_path, "w") as handle:
        handle.write("\n".join(("email", "test@example.com")))

    outcome = validate_csv(csv_path, schema)
    assert not outcome["is_valid"]
def test_validate_schema_min_max_on_string_boolean(self, temp_dir):
    """A schema putting ``min`` on a string and ``max`` on a boolean must fail.

    Only the overall ``is_valid`` flag is asserted.
    """
    string_with_min = {"name": "name", "type": "string", "min": 5, "required": True}
    boolean_with_max = {"name": "active", "type": "boolean", "max": 10, "required": True}
    schema = {
        "name": "Test Schema",
        "fields": [string_with_min, boolean_with_max],
    }

    csv_path = os.path.join(temp_dir, "test.csv")
    with open(csv_path, "w") as handle:
        handle.write("\n".join(("name,active", "John,true")))

    outcome = validate_csv(csv_path, schema)
    assert not outcome["is_valid"]
def test_validate_csv_nonexistent_file(self, basic_schema):
    """A path that does not exist must produce exactly one ``CSVFileError``.

    The path is relative, so the test presumes no ``nonexistent_file.csv``
    exists in the current working directory.
    """
    missing_path = "nonexistent_file.csv"
    file_error = {
        "error_type": "CSVFileError",
        "error_message": "CSV file not found: nonexistent_file.csv",
        "row": None,
        "column": None,
        "value": None,
        "details": {"file_path": "nonexistent_file.csv"},
    }
    expected = {"is_valid": False, "errors": [file_error]}

    outcome = validate_csv(missing_path, basic_schema)
    assert outcome == expected
def test_validate_csv_whitespace_only_fields(self, temp_dir, basic_schema):
    """Cells holding only a space in name, email or department must fail validation.

    Only the overall ``is_valid`` flag is asserted.
    """
    lines = (
        "id,name,email,department,salary,is_active",
        "1, ,john.doe@company.com,Engineering,75000,true",
        "2,John Doe, ,Marketing,65000,false",
        "3,John Doe,john.doe@company.com, ,75000,true",
    )
    csv_path = os.path.join(temp_dir, "whitespace.csv")
    with open(csv_path, "w") as handle:
        handle.write("\n".join(lines))

    outcome = validate_csv(csv_path, basic_schema)
    assert not outcome["is_valid"]
def test_validate_csv_number_formats(self, temp_dir, basic_schema):
    """A file mixing salary spellings 75000.0, 65000.50, 1e5 and inf must fail.

    Only the overall ``is_valid`` flag is asserted; the test does not pin
    which of the spellings is rejected.
    """
    lines = (
        "id,name,email,department,salary,is_active",
        "1,John Doe,john.doe@company.com,Engineering,75000.0,true",
        "2,Jane Smith,jane.smith@company.com,Marketing,65000.50,false",
        "3,Bob Johnson,bob@company.com,Sales,1e5,true",
        "4,Alice Williams,alice@company.com,Engineering,inf,false",
    )
    csv_path = os.path.join(temp_dir, "number_formats.csv")
    with open(csv_path, "w") as handle:
        handle.write("\n".join(lines))

    outcome = validate_csv(csv_path, basic_schema)
    assert not outcome["is_valid"]
def test_validate_csv_boolean_case_variations(self, temp_dir, basic_schema):
    """Boolean cells spelled TRUE, FALSE, True and False must all be accepted."""
    lines = (
        "id,name,email,department,salary,is_active",
        "1,John Doe,john.doe@company.com,Engineering,75000,TRUE",
        "2,Jane Smith,jane.smith@company.com,Marketing,65000,FALSE",
        "3,Bob Johnson,bob@company.com,Sales,70000,True",
        "4,Alice Williams,alice@company.com,Engineering,80000,False",
    )
    csv_path = os.path.join(temp_dir, "boolean_cases.csv")
    with open(csv_path, "w") as handle:
        handle.write("\n".join(lines))

    outcome = validate_csv(csv_path, basic_schema)
    assert outcome["is_valid"]
def test_validate_csv_no_required_fields(self, temp_dir):
    """A schema with only optional fields must validate a matching row cleanly."""
    optional_id = {"name": "id", "type": "integer", "required": False}
    optional_name = {"name": "name", "type": "string", "required": False}
    schema = {
        "name": "Test Schema",
        "fields": [optional_id, optional_name],
    }

    csv_path = os.path.join(temp_dir, "no_required.csv")
    with open(csv_path, "w") as handle:
        handle.write("\n".join(("id,name", "1,John")))

    outcome = validate_csv(csv_path, schema)
    assert outcome["is_valid"]
    error_count = len(outcome["errors"])
    assert error_count == 0
def test_validate_csv_extra_columns(self, temp_dir, basic_schema):
    """A column that is absent from the schema must not make validation fail."""
    lines = (
        "id,name,email,department,salary,is_active,extra_column",
        "1,John Doe,john.doe@company.com,Engineering,75000,true,extra_value",
        "2,Jane Smith,jane.smith@company.com,Marketing,65000,false,another_extra",
    )
    csv_path = os.path.join(temp_dir, "extra_columns.csv")
    with open(csv_path, "w") as handle:
        handle.write("\n".join(lines))

    outcome = validate_csv(csv_path, basic_schema)
    assert outcome["is_valid"]
def test_validate_csv_error_message_clarity(self, temp_dir, basic_schema):
    """Check that one fully invalid row yields every expected error type.

    The single data row has a non-integer id, an email that does not match
    the pattern, a department outside the enum, a salary below the minimum
    and a non-boolean ``is_active``. Two of those cells (``id`` and
    ``is_active``) are type failures, so ``TypeValidationError`` is required
    at least twice; each other type is required at least once.
    """
    csv_content = """id,name,email,department,salary,is_active
invalid-id,John Doe,invalid-email,InvalidDept,25000,maybe"""

    csv_file = os.path.join(temp_dir, "error_clarity.csv")
    with open(csv_file, "w") as f:
        f.write(csv_content)

    result = validate_csv(csv_file, basic_schema)
    assert not result["is_valid"]

    error_types = [error.get("error_type") for error in result["errors"]]

    # Minimum occurrences per error type. A plain membership check would let
    # a single TypeValidationError satisfy both type failures in the row.
    minimum_counts = {
        "TypeValidationError": 2,
        "PatternValidationError": 1,
        "EnumValidationError": 1,
        "RangeValidationError": 1,
    }
    for expected_type, minimum in minimum_counts.items():
        found = error_types.count(expected_type)
        assert found >= minimum, (
            f"Missing expected error type: {expected_type} "
            f"(expected at least {minimum}, found {found})"
        )
def test_validate_csv_empty_fields_with_required_constraints(self, temp_dir):
    """Empty cells in required columns must make validation fail.

    ``id`` and ``name`` are required, ``email`` is optional. The first row
    leaves ``id`` and ``email`` empty, the second leaves ``name`` empty.
    Only the overall ``is_valid`` flag is asserted.
    """
    required_id = {"name": "id", "type": "integer", "required": True}
    required_name = {"name": "name", "type": "string", "required": True}
    optional_email = {"name": "email", "type": "string", "required": False}  # Not required
    schema = {
        "name": "Test Schema",
        "fields": [required_id, required_name, optional_email],
    }

    lines = (
        "id,name,email",
        ",John Doe,",
        "1,,jane@example.com",
    )
    csv_path = os.path.join(temp_dir, "empty_required.csv")
    with open(csv_path, "w") as handle:
        handle.write("\n".join(lines))

    outcome = validate_csv(csv_path, schema)
    assert not outcome["is_valid"]
def test_validate_csv_very_long_strings(self, temp_dir, basic_schema):
    """A 1000-character name must not make an otherwise valid row fail."""
    long_name = "A" * 1000  # Very long name
    lines = (
        "id,name,email,department,salary,is_active",
        f"1,{long_name},john.doe@company.com,Engineering,75000,true",
    )
    csv_path = os.path.join(temp_dir, "long_strings.csv")
    with open(csv_path, "w") as handle:
        handle.write("\n".join(lines))

    outcome = validate_csv(csv_path, basic_schema)
    assert outcome["is_valid"]
def test_validate_csv_scientific_notation(self, temp_dir, basic_schema):
    """Salaries written as ``7.5e4`` and ``6.5e4`` must be accepted as numbers."""
    lines = (
        "id,name,email,department,salary,is_active",
        "1,John Doe,john.doe@company.com,Engineering,7.5e4,true",
        "2,Jane Smith,jane.smith@company.com,Marketing,6.5e4,false",
    )
    csv_path = os.path.join(temp_dir, "scientific.csv")
    with open(csv_path, "w") as handle:
        handle.write("\n".join(lines))

    outcome = validate_csv(csv_path, basic_schema)
    assert outcome["is_valid"]
def test_validate_csv_negative_numbers(self, temp_dir, basic_schema):
    """A salary of -75000 (below the schema minimum of 30000) must fail validation.

    Only the overall ``is_valid`` flag is asserted.
    """
    lines = (
        "id,name,email,department,salary,is_active",
        "1,John Doe,john.doe@company.com,Engineering,-75000,true",
    )
    csv_path = os.path.join(temp_dir, "negative.csv")
    with open(csv_path, "w") as handle:
        handle.write("\n".join(lines))

    outcome = validate_csv(csv_path, basic_schema)
    assert not outcome["is_valid"]
def test_validate_csv_zero_values(self, temp_dir, basic_schema):
    """A row with id 0 and salary 0 must fail validation.

    The salary is below the schema minimum of 30000. Only the overall
    ``is_valid`` flag is asserted.
    """
    lines = (
        "id,name,email,department,salary,is_active",
        "0,John Doe,john.doe@company.com,Engineering,0,true",
    )
    csv_path = os.path.join(temp_dir, "zero.csv")
    with open(csv_path, "w") as handle:
        handle.write("\n".join(lines))

    outcome = validate_csv(csv_path, basic_schema)
    assert not outcome["is_valid"]