Coverage for /var/devmt/py/utils4_1.6.0/utils4/reader.py: 100%
42 statements
« prev ^ index » next coverage.py v7.6.1, created at 2024-11-13 09:50 +0000
« prev ^ index » next coverage.py v7.6.1, created at 2024-11-13 09:50 +0000
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3"""
4:Purpose: This module contains purpose-built data readers for formats
5 which are no longer supported, namely:
7 - **.xls:** pre-Excel 5.0/95 Workbook
9:Platform: Linux/Windows | Python 3.7+
10:Developer: J Berendt
11:Email: development@s3dev.uk
13:Comments: n/a
15:Example:
17 Example for reading an old-style .xls (pre-Excel 5.0/95) Workbook into a
18 DataFrame::
20 >>> from utils4.reader import reader
21 >>> df = reader.read_xls('/path/to/file.xls')
23"""
24# pylint: disable=invalid-name
25# pylint: disable=wrong-import-order
27import io
28import os
29import pandas as pd
30import xlrd
31from datetime import datetime as dt
34class Reader:
35 """Class wrapper for various data reading methods.
37 For details on each reader, refer to the docstring for that reader.
39 """
41 def read_xls(self,
42 filepath: str,
43 encoding: str=None,
44 sheet_index: int=0,
45 skiprows: int=0,
46 skipcols: int=0,
47 chopcols: int=0,
48 date_formats: dict=None,
49 errors: str='coerce',
50 fill_date_errors: bool=False) -> pd.DataFrame:
51 """Read a pre-Excel 5.0/95 .XLS file into a DataFrame.
53 This function is designed to deal with *old* XLS files which
54 the ``pandas.read_excel`` function *does not support*.
56 Args:
57 filepath (str): Full path to the file to be read.
58 encoding (str, optional): Encoding used to read the XLS file.
59 Defaults to None.
60 sheet_index (int, optional): Index of the sheet to be read,
61 zero-based. Defaults to 0.
62 skiprows (int, optional): Number of rows to skip (from the
63 beginning of the file). Defaults to 0.
64 skipcols (int, optional): Number of columns to skip (from the left).
65 Defaults to 0.
66 chopcols (int, optional): Number of columns to skip/chop (from the
67 right). Defaults to 0.
68 date_formats (dict, optional): Dictionary of
69 ``{col_name: strftime_mask}``. Defaults to None.
70 errors (str, optional): Method used by :func:`~pandas.read_csv` to
71 resolve date parsing errors. Defaults to 'coerce'.
72 fill_date_errors (bool, optional): Fill coerced NaT date errors
73 with '1900-01-01'. Defaults to False.
75 :Logic:
76 The passed XLS file is opened and parsed by the ``xlrd`` library,
77 then read into an in-memory stream buffer, which is
78 passed into ``pandas.read_csv`` function for conversion to a
79 DataFrame.
81 Raises:
82 ValueError: If the file extension is not ``.xls``.
83 IOError: If the workbook does not contain any rows of data.
85 Returns:
86 df (pd.DataFrame): A DataFrame containing the contents of
87 the XLS file.
89 """
90 if os.path.splitext(filepath)[1].lower() != '.xls':
91 raise ValueError('The file *must* be an XLS file.')
92 chopcols = -chopcols if chopcols else None
93 stream = io.StringIO(newline='\n')
94 wb = xlrd.open_workbook(filepath, encoding_override=encoding, formatting_info=True)
95 ws = wb.sheet_by_index(sheet_index)
96 if not ws.nrows:
97 raise IOError('This workbook does not contain any rows of data.')
98 rows = ws.get_rows()
99 if skiprows:
100 for _ in range(skiprows):
101 next(rows)
102 for r in rows:
103 row = r[skipcols:chopcols]
104 # Ensure xldate formats are parsed correctly.
105 data = self._extract_row(row=row)
106 stream.write(data + '\n')
107 _ = stream.seek(0)
108 df = pd.read_csv(stream)
109 if date_formats:
110 for col, fmt in date_formats.items():
111 df[col] = pd.to_datetime(df[col], format=fmt, errors=errors)
112 if fill_date_errors:
113 # Changed to remove inplace=True due to pandas v3.0 deprecation warnings.
114 df[col] = df[col].fillna(dt(1900,1,1))
115 stream.close()
116 return df
118 @staticmethod
119 def _extract_row(row: iter) -> str:
120 """Extract and parse each row.
122 Args:
123 row (iter): Iterable object which is converted into a string,
124 separated by the separator specified by the ``sep`` argument.
125 sep (str, optional): Separator character. Defaults to ``','``.
127 Returns:
128 str: A string containing all row values, separated by the ``sep``
129 character.
131 """
132 def _value_generator(row: iter) -> str:
133 """Parse each row value based on its ``xf_index`` value.
135 Args:
136 row (iter): Iterable object.
138 Yields:
139 str: Each parsed value from the iterable.
141 """
142 for i in row:
143 if i.xf_index == 62:
144 val = xlrd.xldate.xldate_as_datetime(i.value, 0)
145 else:
146 val = i.value
147 yield str(val)
148 return ','.join(_value_generator(row=row))
151reader = Reader()