Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/pandas/io/excel/_base.py : 28%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1import abc
2from datetime import date, datetime, timedelta
3from io import BytesIO
4import os
5from textwrap import fill
7from pandas._config import config
9from pandas._libs.parsers import STR_NA_VALUES
10from pandas.errors import EmptyDataError
11from pandas.util._decorators import Appender
13from pandas.core.dtypes.common import is_bool, is_float, is_integer, is_list_like
15from pandas.core.frame import DataFrame
17from pandas.io.common import (
18 get_filepath_or_buffer,
19 is_url,
20 stringify_path,
21 urlopen,
22 validate_header_arg,
23)
24from pandas.io.excel._util import (
25 _fill_mi_header,
26 _get_default_writer,
27 _maybe_convert_usecols,
28 _pop_header_name,
29 get_writer,
30)
31from pandas.io.formats.printing import pprint_thing
32from pandas.io.parsers import TextParser
# Shared docstring for ``read_excel``; attached to the function below via
# ``@Appender`` so the default-NA-values list can be interpolated once at
# import time (see the ``fill(...)`` concatenation in the middle).
_read_excel_doc = (
    """
Read an Excel file into a pandas DataFrame.

Supports `xls`, `xlsx`, `xlsm`, `xlsb`, and `odf` file extensions
read from a local filesystem or URL. Supports an option to read
a single sheet or a list of sheets.

Parameters
----------
io : str, bytes, ExcelFile, xlrd.Book, path object, or file-like object
    Any valid string path is acceptable. The string could be a URL. Valid
    URL schemes include http, ftp, s3, and file. For file URLs, a host is
    expected. A local file could be: ``file://localhost/path/to/table.xlsx``.

    If you want to pass in a path object, pandas accepts any ``os.PathLike``.

    By file-like object, we refer to objects with a ``read()`` method,
    such as a file handler (e.g. via builtin ``open`` function)
    or ``StringIO``.
sheet_name : str, int, list, or None, default 0
    Strings are used for sheet names. Integers are used in zero-indexed
    sheet positions. Lists of strings/integers are used to request
    multiple sheets. Specify None to get all sheets.

    Available cases:

    * Defaults to ``0``: 1st sheet as a `DataFrame`
    * ``1``: 2nd sheet as a `DataFrame`
    * ``"Sheet1"``: Load sheet with name "Sheet1"
    * ``[0, 1, "Sheet5"]``: Load first, second and sheet named "Sheet5"
      as a dict of `DataFrame`
    * None: All sheets.

header : int, list of int, default 0
    Row (0-indexed) to use for the column labels of the parsed
    DataFrame. If a list of integers is passed those row positions will
    be combined into a ``MultiIndex``. Use None if there is no header.
names : array-like, default None
    List of column names to use. If file contains no header row,
    then you should explicitly pass header=None.
index_col : int, list of int, default None
    Column (0-indexed) to use as the row labels of the DataFrame.
    Pass None if there is no such column. If a list is passed,
    those columns will be combined into a ``MultiIndex``. If a
    subset of data is selected with ``usecols``, index_col
    is based on the subset.
usecols : int, str, list-like, or callable, default None
    * If None, then parse all columns.
    * If str, then indicates comma separated list of Excel column letters
      and column ranges (e.g. "A:E" or "A,C,E:F"). Ranges are inclusive of
      both sides.
    * If list of int, then indicates list of column numbers to be parsed.
    * If list of string, then indicates list of column names to be parsed.

      .. versionadded:: 0.24.0

    * If callable, then evaluate each column name against it and parse the
      column if the callable returns ``True``.

    Returns a subset of the columns according to behavior above.

      .. versionadded:: 0.24.0

squeeze : bool, default False
    If the parsed data only contains one column then return a Series.
dtype : Type name or dict of column -> type, default None
    Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32}
    Use `object` to preserve data as stored in Excel and not interpret dtype.
    If converters are specified, they will be applied INSTEAD
    of dtype conversion.
engine : str, default None
    If io is not a buffer or path, this must be set to identify io.
    Acceptable values are None, "xlrd", "openpyxl" or "odf".
converters : dict, default None
    Dict of functions for converting values in certain columns. Keys can
    either be integers or column labels, values are functions that take one
    input argument, the Excel cell content, and return the transformed
    content.
true_values : list, default None
    Values to consider as True.
false_values : list, default None
    Values to consider as False.
skiprows : list-like
    Rows to skip at the beginning (0-indexed).
nrows : int, default None
    Number of rows to parse.

    .. versionadded:: 0.23.0

na_values : scalar, str, list-like, or dict, default None
    Additional strings to recognize as NA/NaN. If dict passed, specific
    per-column NA values. By default the following values are interpreted
    as NaN: '"""
    + fill("', '".join(sorted(STR_NA_VALUES)), 70, subsequent_indent="    ")
    + """'.
keep_default_na : bool, default True
    Whether or not to include the default NaN values when parsing the data.
    Depending on whether `na_values` is passed in, the behavior is as follows:

    * If `keep_default_na` is True, and `na_values` are specified, `na_values`
      is appended to the default NaN values used for parsing.
    * If `keep_default_na` is True, and `na_values` are not specified, only
      the default NaN values are used for parsing.
    * If `keep_default_na` is False, and `na_values` are specified, only
      the NaN values specified `na_values` are used for parsing.
    * If `keep_default_na` is False, and `na_values` are not specified, no
      strings will be parsed as NaN.

    Note that if `na_filter` is passed in as False, the `keep_default_na` and
    `na_values` parameters will be ignored.
na_filter : bool, default True
    Detect missing value markers (empty strings and the value of na_values). In
    data without any NAs, passing na_filter=False can improve the performance
    of reading a large file.
verbose : bool, default False
    Indicate number of NA values placed in non-numeric columns.
parse_dates : bool, list-like, or dict, default False
    The behavior is as follows:

    * bool. If True -> try parsing the index.
    * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3
      each as a separate date column.
    * list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as
      a single date column.
    * dict, e.g. {'foo' : [1, 3]} -> parse columns 1, 3 as date and call
      result 'foo'

    If a column or index contains an unparseable date, the entire column or
    index will be returned unaltered as an object data type. If you don't want to
    parse some cells as date just change their type in Excel to "Text".
    For non-standard datetime parsing, use ``pd.to_datetime`` after ``pd.read_excel``.

    Note: A fast-path exists for iso8601-formatted dates.
date_parser : function, optional
    Function to use for converting a sequence of string columns to an array of
    datetime instances. The default uses ``dateutil.parser.parser`` to do the
    conversion. Pandas will try to call `date_parser` in three different ways,
    advancing to the next if an exception occurs: 1) Pass one or more arrays
    (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the
    string values from the columns defined by `parse_dates` into a single array
    and pass that; and 3) call `date_parser` once for each row using one or
    more strings (corresponding to the columns defined by `parse_dates`) as
    arguments.
thousands : str, default None
    Thousands separator for parsing string columns to numeric. Note that
    this parameter is only necessary for columns stored as TEXT in Excel,
    any numeric columns will automatically be parsed, regardless of display
    format.
comment : str, default None
    Comments out remainder of line. Pass a character or characters to this
    argument to indicate comments in the input file. Any data between the
    comment string and the end of the current line is ignored.
skipfooter : int, default 0
    Rows at the end to skip (0-indexed).
convert_float : bool, default True
    Convert integral floats to int (i.e., 1.0 --> 1). If False, all numeric
    data will be read in as floats: Excel stores all numbers as floats
    internally.
mangle_dupe_cols : bool, default True
    Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than
    'X'...'X'. Passing in False will cause data to be overwritten if there
    are duplicate names in the columns.
**kwds : optional
    Optional keyword arguments can be passed to ``TextFileReader``.

Returns
-------
DataFrame or dict of DataFrames
    DataFrame from the passed in Excel file. See notes in sheet_name
    argument for more information on when a dict of DataFrames is returned.

See Also
--------
to_excel : Write DataFrame to an Excel file.
to_csv : Write DataFrame to a comma-separated values (csv) file.
read_csv : Read a comma-separated values (csv) file into DataFrame.
read_fwf : Read a table of fixed-width formatted lines into DataFrame.

Examples
--------
The file can be read using the file name as string or an open file object:

>>> pd.read_excel('tmp.xlsx', index_col=0)  # doctest: +SKIP
       Name  Value
0   string1      1
1   string2      2
2  #Comment      3

>>> pd.read_excel(open('tmp.xlsx', 'rb'),
...               sheet_name='Sheet3')  # doctest: +SKIP
   Unnamed: 0      Name  Value
0           0   string1      1
1           1   string2      2
2           2  #Comment      3

Index and header can be specified via the `index_col` and `header` arguments

>>> pd.read_excel('tmp.xlsx', index_col=None, header=None)  # doctest: +SKIP
     0         1      2
0  NaN      Name  Value
1  0.0   string1      1
2  1.0   string2      2
3  2.0  #Comment      3

Column types are inferred but can be explicitly specified

>>> pd.read_excel('tmp.xlsx', index_col=0,
...               dtype={'Name': str, 'Value': float})  # doctest: +SKIP
       Name  Value
0   string1    1.0
1   string2    2.0
2  #Comment    3.0

True, False, and NA values, and thousands separators have defaults,
but can be explicitly specified, too. Supply the values you would like
as strings or lists of strings!

>>> pd.read_excel('tmp.xlsx', index_col=0,
...               na_values=['string1', 'string2'])  # doctest: +SKIP
       Name  Value
0       NaN      1
1       NaN      2
2  #Comment      3

Comment lines in the excel input file can be skipped using the `comment` kwarg

>>> pd.read_excel('tmp.xlsx', index_col=0, comment='#')  # doctest: +SKIP
      Name  Value
0  string1    1.0
1  string2    2.0
2     None    NaN
"""
)
@Appender(_read_excel_doc)
def read_excel(
    io,
    sheet_name=0,
    header=0,
    names=None,
    index_col=None,
    usecols=None,
    squeeze=False,
    dtype=None,
    engine=None,
    converters=None,
    true_values=None,
    false_values=None,
    skiprows=None,
    nrows=None,
    na_values=None,
    keep_default_na=True,
    verbose=False,
    parse_dates=False,
    date_parser=None,
    thousands=None,
    comment=None,
    skipfooter=0,
    convert_float=True,
    mangle_dupe_cols=True,
    **kwds,
):
    # No docstring here on purpose: ``@Appender`` attaches ``_read_excel_doc``.

    # Reject removed/renamed keyword aliases explicitly so they produce a
    # clear TypeError instead of being silently swallowed by **kwds.
    for arg in ("sheet", "sheetname", "parse_cols"):
        if arg in kwds:
            raise TypeError(f"read_excel() got an unexpected keyword argument `{arg}`")

    # Track whether *we* opened the ExcelFile: only then are we responsible
    # for closing it (a caller-supplied ExcelFile stays open for reuse).
    should_close = False
    if not isinstance(io, ExcelFile):
        should_close = True
        io = ExcelFile(io, engine=engine)
    elif engine and engine != io.engine:
        raise ValueError(
            "Engine should not be specified when passing "
            "an ExcelFile - ExcelFile already has the engine set"
        )

    try:
        data = io.parse(
            sheet_name=sheet_name,
            header=header,
            names=names,
            index_col=index_col,
            usecols=usecols,
            squeeze=squeeze,
            dtype=dtype,
            converters=converters,
            true_values=true_values,
            false_values=false_values,
            skiprows=skiprows,
            nrows=nrows,
            na_values=na_values,
            keep_default_na=keep_default_na,
            verbose=verbose,
            parse_dates=parse_dates,
            date_parser=date_parser,
            thousands=thousands,
            comment=comment,
            skipfooter=skipfooter,
            convert_float=convert_float,
            mangle_dupe_cols=mangle_dupe_cols,
            **kwds,
        )
    finally:
        # Fix for the read_excel file-handle leak: close the handle we
        # opened ourselves, even if parsing raised.
        if should_close:
            io.close()
    return data
class _BaseExcelReader(metaclass=abc.ABCMeta):
    """
    Abstract base class for engine-specific Excel readers.

    Subclasses supply workbook loading and raw cell extraction (the abstract
    methods below); this base class handles acquiring the input and the
    engine-independent conversion of raw rows into DataFrames via ``parse``.
    """

    def __init__(self, filepath_or_buffer):
        # If filepath_or_buffer is a url, load the data into a BytesIO
        if is_url(filepath_or_buffer):
            filepath_or_buffer = BytesIO(urlopen(filepath_or_buffer).read())
        elif not isinstance(filepath_or_buffer, (ExcelFile, self._workbook_class)):
            filepath_or_buffer, _, _, _ = get_filepath_or_buffer(filepath_or_buffer)

        # Dispatch on what we ended up with: an already-open workbook,
        # a readable buffer, a path string, or raw bytes.
        if isinstance(filepath_or_buffer, self._workbook_class):
            self.book = filepath_or_buffer
        elif hasattr(filepath_or_buffer, "read"):
            # N.B. xlrd.Book has a read attribute too
            filepath_or_buffer.seek(0)
            self.book = self.load_workbook(filepath_or_buffer)
        elif isinstance(filepath_or_buffer, str):
            self.book = self.load_workbook(filepath_or_buffer)
        elif isinstance(filepath_or_buffer, bytes):
            self.book = self.load_workbook(BytesIO(filepath_or_buffer))
        else:
            raise ValueError(
                "Must explicitly set engine if not passing in buffer or path for io."
            )

    @property
    @abc.abstractmethod
    def _workbook_class(self):
        # Engine's workbook type, used for the isinstance checks in __init__.
        pass

    @abc.abstractmethod
    def load_workbook(self, filepath_or_buffer):
        # Open the workbook with the engine; returns the engine's book object.
        pass

    def close(self):
        # Default no-op; engines that hold file handles override this.
        pass

    @property
    @abc.abstractmethod
    def sheet_names(self):
        pass

    @abc.abstractmethod
    def get_sheet_by_name(self, name):
        pass

    @abc.abstractmethod
    def get_sheet_by_index(self, index):
        pass

    @abc.abstractmethod
    def get_sheet_data(self, sheet, convert_float):
        # Return the sheet contents as a list of rows (lists of cell values).
        pass

    def parse(
        self,
        sheet_name=0,
        header=0,
        names=None,
        index_col=None,
        usecols=None,
        squeeze=False,
        dtype=None,
        true_values=None,
        false_values=None,
        skiprows=None,
        nrows=None,
        na_values=None,
        verbose=False,
        parse_dates=False,
        date_parser=None,
        thousands=None,
        comment=None,
        skipfooter=0,
        convert_float=True,
        mangle_dupe_cols=True,
        **kwds,
    ):
        """
        Parse one or more sheets into DataFrames.

        Returns a single DataFrame for a scalar ``sheet_name``, or a dict of
        DataFrames keyed by sheet when a list (or None = all sheets) is given.
        See the read_excel docstring for parameter semantics; unknown keyword
        arguments are forwarded to ``TextParser``.
        """
        validate_header_arg(header)

        ret_dict = False

        # Keep sheetname to maintain backwards compatibility.
        if isinstance(sheet_name, list):
            sheets = sheet_name
            ret_dict = True
        elif sheet_name is None:
            sheets = self.sheet_names
            ret_dict = True
        else:
            sheets = [sheet_name]

        # handle same-type duplicates.
        # dict.fromkeys preserves first-seen order while dropping repeats.
        sheets = list(dict.fromkeys(sheets).keys())

        output = {}

        for asheetname in sheets:
            if verbose:
                print(f"Reading sheet {asheetname}")

            if isinstance(asheetname, str):
                sheet = self.get_sheet_by_name(asheetname)
            else:  # assume an integer if not a string
                sheet = self.get_sheet_by_index(asheetname)

            data = self.get_sheet_data(sheet, convert_float)
            # NOTE(review): ``usecols`` and ``header`` are rebound inside this
            # loop; subsequent sheets see the normalized values — presumably
            # intentional since normalization is idempotent.
            usecols = _maybe_convert_usecols(usecols)

            if not data:
                output[asheetname] = DataFrame()
                continue

            if is_list_like(header) and len(header) == 1:
                header = header[0]

            # forward fill and pull out names for MultiIndex column
            header_names = None
            if header is not None and is_list_like(header):
                header_names = []
                control_row = [True] * len(data[0])

                for row in header:
                    if is_integer(skiprows):
                        # header row indices are relative to the post-skip data
                        row += skiprows

                    data[row], control_row = _fill_mi_header(data[row], control_row)

                    if index_col is not None:
                        header_name, _ = _pop_header_name(data[row], index_col)
                        header_names.append(header_name)

            if is_list_like(index_col):
                # Forward fill values for MultiIndex index.
                if not is_list_like(header):
                    offset = 1 + header
                else:
                    offset = 1 + max(header)

                # Check if we have an empty dataset
                # before trying to collect data.
                if offset < len(data):
                    for col in index_col:
                        last = data[offset][col]

                        for row in range(offset + 1, len(data)):
                            if data[row][col] == "" or data[row][col] is None:
                                data[row][col] = last
                            else:
                                last = data[row][col]

            has_index_names = is_list_like(header) and len(header) > 1

            # GH 12292 : error when read one empty column from excel file
            try:
                parser = TextParser(
                    data,
                    names=names,
                    header=header,
                    index_col=index_col,
                    has_index_names=has_index_names,
                    squeeze=squeeze,
                    dtype=dtype,
                    true_values=true_values,
                    false_values=false_values,
                    skiprows=skiprows,
                    nrows=nrows,
                    na_values=na_values,
                    parse_dates=parse_dates,
                    date_parser=date_parser,
                    thousands=thousands,
                    comment=comment,
                    skipfooter=skipfooter,
                    usecols=usecols,
                    mangle_dupe_cols=mangle_dupe_cols,
                    **kwds,
                )

                output[asheetname] = parser.read(nrows=nrows)

                if not squeeze or isinstance(output[asheetname], DataFrame):
                    if header_names:
                        output[asheetname].columns = output[
                            asheetname
                        ].columns.set_names(header_names)

            except EmptyDataError:
                # No Data, return an empty DataFrame
                output[asheetname] = DataFrame()

        if ret_dict:
            return output
        else:
            return output[asheetname]
class ExcelWriter(metaclass=abc.ABCMeta):
    """
    Class for writing DataFrame objects into excel sheets.

    Default is to use xlwt for xls, openpyxl for xlsx.
    See DataFrame.to_excel for typical usage.

    Parameters
    ----------
    path : str
        Path to xls or xlsx file.
    engine : str (optional)
        Engine to use for writing. If None, defaults to
        ``io.excel.<extension>.writer``.  NOTE: can only be passed as a keyword
        argument.
    date_format : str, default None
        Format string for dates written into Excel files (e.g. 'YYYY-MM-DD').
    datetime_format : str, default None
        Format string for datetime objects written into Excel files.
        (e.g. 'YYYY-MM-DD HH:MM:SS').
    mode : {'w', 'a'}, default 'w'
        File mode to use (write or append).

        .. versionadded:: 0.24.0

    Attributes
    ----------
    None

    Methods
    -------
    None

    Notes
    -----
    None of the methods and properties are considered public.

    For compatibility with CSV writers, ExcelWriter serializes lists
    and dicts to strings before writing.

    Examples
    --------
    Default usage:

    >>> with ExcelWriter('path_to_file.xlsx') as writer:
    ...     df.to_excel(writer)

    To write to separate sheets in a single file:

    >>> with ExcelWriter('path_to_file.xlsx') as writer:
    ...     df1.to_excel(writer, sheet_name='Sheet1')
    ...     df2.to_excel(writer, sheet_name='Sheet2')

    You can set the date format or datetime format:

    >>> with ExcelWriter('path_to_file.xlsx',
    ...                  date_format='YYYY-MM-DD',
    ...                  datetime_format='YYYY-MM-DD HH:MM:SS') as writer:
    ...     df.to_excel(writer)

    You can also append to an existing Excel file:

    >>> with ExcelWriter('path_to_file.xlsx', mode='a') as writer:
    ...     df.to_excel(writer, sheet_name='Sheet3')
    """

    # Defining an ExcelWriter implementation (see abstract methods for more...)

    # - Mandatory
    #   - ``write_cells(self, cells, sheet_name=None, startrow=0, startcol=0)``
    #     --> called to write additional DataFrames to disk
    #   - ``supported_extensions`` (tuple of supported extensions), used to
    #     check that engine supports the given extension.
    #   - ``engine`` - string that gives the engine name. Necessary to
    #     instantiate class directly and bypass ``ExcelWriterMeta`` engine
    #     lookup.
    #   - ``save(self)`` --> called to save file to disk
    # - Mostly mandatory (i.e. should at least exist)
    #   - book, cur_sheet, path

    # - Optional:
    #   - ``__init__(self, path, engine=None, **kwargs)`` --> always called
    #     with path as first argument.

    # You also need to register the class with ``register_writer()``.
    # Technically, ExcelWriter implementations don't need to subclass
    # ExcelWriter.
    def __new__(cls, path, engine=None, **kwargs):
        # only switch class if generic(ExcelWriter): resolve the engine from
        # the path's extension and hand off construction to that writer class.
        if cls is ExcelWriter:
            if engine is None or (isinstance(engine, str) and engine == "auto"):
                if isinstance(path, str):
                    ext = os.path.splitext(path)[-1][1:]
                else:
                    # non-string paths (e.g. buffers) default to the xlsx writer
                    ext = "xlsx"

                try:
                    engine = config.get_option(f"io.excel.{ext}.writer")
                    if engine == "auto":
                        engine = _get_default_writer(ext)
                except KeyError:
                    raise ValueError(f"No engine for filetype: '{ext}'")
            cls = get_writer(engine)

        return object.__new__(cls)

    # declare external properties you can count on
    book = None
    # NOTE(review): ``curr_sheet`` appears to be an unused twin of the
    # ``cur_sheet`` attribute set in __init__ — kept for compatibility.
    curr_sheet = None
    path = None

    @property
    @abc.abstractmethod
    def supported_extensions(self):
        """Extensions that writer engine supports."""
        pass

    @property
    @abc.abstractmethod
    def engine(self):
        """Name of engine."""
        pass

    @abc.abstractmethod
    def write_cells(
        self, cells, sheet_name=None, startrow=0, startcol=0, freeze_panes=None
    ):
        """
        Write given formatted cells into an Excel sheet.

        Parameters
        ----------
        cells : generator
            cell of formatted data to save to Excel sheet
        sheet_name : str, default None
            Name of Excel sheet, if None, then use self.cur_sheet
        startrow : upper left cell row to dump data frame
        startcol : upper left cell column to dump data frame
        freeze_panes: int tuple of length 2
            contains the bottom-most row and right-most column to freeze
        """
        pass

    @abc.abstractmethod
    def save(self):
        """
        Save workbook to disk.
        """
        pass

    def __init__(
        self,
        path,
        engine=None,
        date_format=None,
        datetime_format=None,
        mode="w",
        **engine_kwargs,
    ):
        # validate that this engine can handle the extension
        if isinstance(path, str):
            ext = os.path.splitext(path)[-1]
        else:
            ext = "xls" if engine == "xlwt" else "xlsx"

        self.check_extension(ext)

        self.path = path
        self.sheets = {}
        self.cur_sheet = None

        if date_format is None:
            self.date_format = "YYYY-MM-DD"
        else:
            self.date_format = date_format
        if datetime_format is None:
            self.datetime_format = "YYYY-MM-DD HH:MM:SS"
        else:
            self.datetime_format = datetime_format

        self.mode = mode

    def __fspath__(self):
        return stringify_path(self.path)

    def _get_sheet_name(self, sheet_name):
        # Fall back to the currently active sheet when none is given.
        if sheet_name is None:
            sheet_name = self.cur_sheet
        if sheet_name is None:  # pragma: no cover
            raise ValueError("Must pass explicit sheet_name or set cur_sheet property")
        return sheet_name

    def _value_with_fmt(self, val):
        """
        Convert numpy types to Python types for the Excel writers.

        Parameters
        ----------
        val : object
            Value to be written into cells

        Returns
        -------
        Tuple with the first element being the converted value and the second
            being an optional format
        """
        fmt = None

        if is_integer(val):
            val = int(val)
        elif is_float(val):
            val = float(val)
        elif is_bool(val):
            val = bool(val)
        elif isinstance(val, datetime):
            # must come before the date check: datetime is a date subclass
            fmt = self.datetime_format
        elif isinstance(val, date):
            fmt = self.date_format
        elif isinstance(val, timedelta):
            # Excel stores durations as fractional days
            val = val.total_seconds() / float(86400)
            fmt = "0"
        else:
            # anything unrecognized is written as its string representation
            val = str(val)

        return val, fmt

    @classmethod
    def check_extension(cls, ext):
        """checks that path's extension against the Writer's supported
        extensions. If it isn't supported, raises UnsupportedFiletypeError."""
        if ext.startswith("."):
            ext = ext[1:]
        if not any(ext in extension for extension in cls.supported_extensions):
            # BUG FIX: the engine/extension details were previously in a
            # separate, unused f-string expression, so the raised message
            # was truncated to just "Invalid extension for engine".
            raise ValueError(
                f"Invalid extension for engine "
                f"'{pprint_thing(cls.engine)}': '{pprint_thing(ext)}'"
            )
        else:
            return True

    # Allow use as a contextmanager
    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        """synonym for save, to make it more file-like"""
        return self.save()
class ExcelFile:
    """
    Class for parsing tabular excel sheets into DataFrame objects.

    Uses xlrd by default. See read_excel for more documentation.

    Parameters
    ----------
    io : str, path object (pathlib.Path or py._path.local.LocalPath),
        a file-like object, xlrd workbook or openpyxl workbook.
        If a string or path object, expected to be a path to xls, xlsx or odf file.
    engine : str, default None
        If io is not a buffer or path, this must be set to identify io.
        Acceptable values are None, ``xlrd``, ``openpyxl``, ``odf``, or ``pyxlsb``.
        Note that ``odf`` reads tables out of OpenDocument formatted files.
    """

    # Reader classes are imported inside the class body, so they resolve at
    # module import time but stay namespaced on ExcelFile.
    from pandas.io.excel._odfreader import _ODFReader
    from pandas.io.excel._openpyxl import _OpenpyxlReader
    from pandas.io.excel._xlrd import _XlrdReader
    from pandas.io.excel._pyxlsb import _PyxlsbReader

    # Maps the user-facing ``engine`` string to its reader implementation.
    _engines = {
        "xlrd": _XlrdReader,
        "openpyxl": _OpenpyxlReader,
        "odf": _ODFReader,
        "pyxlsb": _PyxlsbReader,
    }

    def __init__(self, io, engine=None):
        if engine is None:
            engine = "xlrd"
        if engine not in self._engines:
            raise ValueError(f"Unknown engine: {engine}")

        self.engine = engine
        # could be a str, ExcelFile, Book, etc.
        self.io = io
        # Always a string
        self._io = stringify_path(io)

        self._reader = self._engines[engine](self._io)

    def __fspath__(self):
        return self._io

    def parse(
        self,
        sheet_name=0,
        header=0,
        names=None,
        index_col=None,
        usecols=None,
        squeeze=False,
        converters=None,
        true_values=None,
        false_values=None,
        skiprows=None,
        nrows=None,
        na_values=None,
        parse_dates=False,
        date_parser=None,
        thousands=None,
        comment=None,
        skipfooter=0,
        convert_float=True,
        mangle_dupe_cols=True,
        **kwds,
    ):
        """
        Parse specified sheet(s) into a DataFrame.

        Equivalent to read_excel(ExcelFile, ...)  See the read_excel
        docstring for more info on accepted parameters.

        Returns
        -------
        DataFrame or dict of DataFrames
            DataFrame from the passed in Excel file.
        """
        # chunked reading is a read_csv feature with no Excel implementation
        if "chunksize" in kwds:
            raise NotImplementedError(
                "chunksize keyword of read_excel is not implemented"
            )

        # Delegate all actual parsing to the engine-specific reader.
        return self._reader.parse(
            sheet_name=sheet_name,
            header=header,
            names=names,
            index_col=index_col,
            usecols=usecols,
            squeeze=squeeze,
            converters=converters,
            true_values=true_values,
            false_values=false_values,
            skiprows=skiprows,
            nrows=nrows,
            na_values=na_values,
            parse_dates=parse_dates,
            date_parser=date_parser,
            thousands=thousands,
            comment=comment,
            skipfooter=skipfooter,
            convert_float=convert_float,
            mangle_dupe_cols=mangle_dupe_cols,
            **kwds,
        )

    @property
    def book(self):
        # Underlying engine-specific workbook object.
        return self._reader.book

    @property
    def sheet_names(self):
        # Sheet names as reported by the engine's reader.
        return self._reader.sheet_names

    def close(self):
        """close io if necessary"""
        self._reader.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def __del__(self):
        # Ensure we don't leak file descriptors, but put in try/except in case
        # attributes are already deleted
        try:
            self.close()
        except AttributeError:
            pass