fastexcel

View Source

  1from __future__ import annotations
  2
  3import sys
  4import typing
  5from typing import TYPE_CHECKING, Callable, Literal
  6
  7if sys.version_info < (3, 10):
  8    from typing_extensions import TypeAlias
  9else:
 10    from typing import TypeAlias
 11
 12if TYPE_CHECKING:
 13    import pandas as pd
 14    import polars as pl
 15
 16from os.path import expanduser
 17from pathlib import Path
 18
 19import pyarrow as pa
 20
 21from ._fastexcel import (
 22    ArrowError,
 23    CalamineCellError,
 24    CalamineError,
 25    CannotRetrieveCellDataError,
 26    CellError,
 27    CellErrors,
 28    ColumnInfo,
 29    ColumnInfoNoDtype,
 30    ColumnNotFoundError,
 31    FastExcelError,
 32    InvalidParametersError,
 33    SheetNotFoundError,
 34    UnsupportedColumnTypeCombinationError,
 35    __version__,
 36    _ExcelReader,
 37    _ExcelSheet,
 38    _ExcelTable,
 39)
 40from ._fastexcel import read_excel as _read_excel
 41
 42DType = Literal["null", "int", "float", "string", "boolean", "datetime", "date", "duration"]
 43DTypeMap: TypeAlias = "dict[str | int, DType]"
 44ColumnNameFrom: TypeAlias = Literal["provided", "looked_up", "generated"]
 45DTypeFrom: TypeAlias = Literal["provided_by_index", "provided_by_name", "guessed"]
 46SheetVisible: TypeAlias = Literal["visible", "hidden", "veryhidden"]
 47
 48
 49def _recordbatch_to_polars(rb: pa.RecordBatch) -> pl.DataFrame:
 50    import polars as pl
 51
 52    df = pl.from_arrow(data=rb)
 53    assert isinstance(df, pl.DataFrame)
 54    return df
 55
 56
 57class ExcelSheet:
 58    """A class representing a single sheet in an Excel File"""
 59
 60    def __init__(self, sheet: _ExcelSheet) -> None:
 61        self._sheet = sheet
 62
 63    @property
 64    def name(self) -> str:
 65        """The name of the sheet"""
 66        return self._sheet.name
 67
 68    @property
 69    def width(self) -> int:
 70        """The sheet's width"""
 71        return self._sheet.width
 72
 73    @property
 74    def height(self) -> int:
 75        """The sheet's height, with `skip_rows` and `nrows` applied"""
 76        return self._sheet.height
 77
 78    @property
 79    def total_height(self) -> int:
 80        """The sheet's total height"""
 81        return self._sheet.total_height
 82
 83    @property
 84    def selected_columns(self) -> list[ColumnInfo]:
 85        """The sheet's selected columns"""
 86        return self._sheet.selected_columns
 87
 88    def available_columns(self) -> list[ColumnInfo]:
 89        """The columns available for the given sheet"""
 90        return self._sheet.available_columns()
 91
 92    @property
 93    def specified_dtypes(self) -> DTypeMap | None:
 94        """The dtypes specified for the sheet"""
 95        return self._sheet.specified_dtypes
 96
 97    @property
 98    def visible(self) -> SheetVisible:
 99        """The visibility of the sheet"""
100        return self._sheet.visible
101
102    def to_arrow(self) -> pa.RecordBatch:
103        """Converts the sheet to a pyarrow `RecordBatch`"""
104        return self._sheet.to_arrow()
105
106    def to_arrow_with_errors(self) -> tuple[pa.RecordBatch, CellErrors | None]:
107        """Converts the sheet to a pyarrow `RecordBatch` with error information.
108
109        Stores the positions of any values that cannot be parsed as the specified type and were
110        therefore converted to None.
111        """
112        rb, cell_errors = self._sheet.to_arrow_with_errors()
113        if not cell_errors.errors:
114            return (rb, None)
115        return (rb, cell_errors)
116
117    def to_pandas(self) -> "pd.DataFrame":
118        """Converts the sheet to a Pandas `DataFrame`.
119
120        Requires the `pandas` extra to be installed.
121        """
122        # We know for sure that the sheet will yield exactly one RecordBatch
123        return self.to_arrow().to_pandas()
124
125    def to_polars(self) -> "pl.DataFrame":
126        """Converts the sheet to a Polars `DataFrame`.
127
128        Requires the `polars` extra to be installed.
129        """
130        return _recordbatch_to_polars(self.to_arrow())
131
132    def __repr__(self) -> str:
133        return self._sheet.__repr__()
134
135
136class ExcelTable:
137    """A class representing a single table in an Excel file"""
138
139    def __init__(self, table: _ExcelTable) -> None:
140        self._table = table
141
142    @property
143    def name(self) -> str:
144        """The name of the table"""
145        return self._table.name
146
147    @property
148    def sheet_name(self) -> str:
149        """The name of the sheet this table belongs to"""
150        return self._table.sheet_name
151
152    @property
153    def width(self) -> int:
154        """The table's width"""
155        return self._table.width
156
157    @property
158    def height(self) -> int:
159        """The table's height"""
160        return self._table.height
161
162    @property
163    def total_height(self) -> int:
164        """The table's total height"""
165        return self._table.total_height
166
167    @property
168    def offset(self) -> int:
169        """The table's offset before data starts"""
170        return self._table.offset
171
172    @property
173    def selected_columns(self) -> list[ColumnInfo]:
174        """The table's selected columns"""
175        return self._table.selected_columns
176
177    def available_columns(self) -> list[ColumnInfo]:
178        """The columns available for the given table"""
179        return self._table.available_columns()
180
181    @property
182    def specified_dtypes(self) -> DTypeMap | None:
183        """The dtypes specified for the table"""
184        return self._table.specified_dtypes
185
186    def to_arrow(self) -> pa.RecordBatch:
187        """Converts the table to a pyarrow `RecordBatch`"""
188        return self._table.to_arrow()
189
190    def to_pandas(self) -> "pd.DataFrame":
191        """Converts the table to a Pandas `DataFrame`.
192
193        Requires the `pandas` extra to be installed.
194        """
195        # We know for sure that the table will yield exactly one RecordBatch
196        return self.to_arrow().to_pandas()
197
198    def to_polars(self) -> "pl.DataFrame":
199        """Converts the table to a Polars `DataFrame`.
200
201        Requires the `polars` extra to be installed.
202        """
203        return _recordbatch_to_polars(self.to_arrow())
204
205
206class ExcelReader:
207    """A class representing an open Excel file and allowing to read its sheets"""
208
209    def __init__(self, reader: _ExcelReader) -> None:
210        self._reader = reader
211
212    @property
213    def sheet_names(self) -> list[str]:
214        """The list of sheet names"""
215        return self._reader.sheet_names
216
217    def load_sheet(
218        self,
219        idx_or_name: int | str,
220        *,
221        header_row: int | None = 0,
222        column_names: list[str] | None = None,
223        skip_rows: int | None = None,
224        n_rows: int | None = None,
225        schema_sample_rows: int | None = 1_000,
226        dtype_coercion: Literal["coerce", "strict"] = "coerce",
227        use_columns: list[str]
228        | list[int]
229        | str
230        | Callable[[ColumnInfoNoDtype], bool]
231        | None = None,
232        dtypes: DType | DTypeMap | None = None,
233    ) -> ExcelSheet:
234        """Loads a sheet lazily by index or name.
235
236        :param idx_or_name: The index (starting at 0) or the name of the sheet to load.
237        :param header_row: The index of the row containing the column labels, default index is 0.
238                           If `None`, the sheet does not have any column labels.
239                           Any rows before the `header_row` will be automatically skipped.
240        :param column_names: Overrides headers found in the document.
241                             If `column_names` is used, `header_row` will be ignored.
242        :param n_rows: Specifies how many rows should be loaded.
243                       If `None`, all rows are loaded
244        :param skip_rows: Specifies how many rows should be skipped after the `header_row`.
245                          Any rows before the `header_row` are automatically skipped.
246                          If `header_row` is `None`:
247                            - if `skip_rows` is `None` (default): it skips all empty rows
248                            at the beginning of the sheet.
249                            - if `skip_rows` is a number, it skips the specified number
250                            of rows from the start of the sheet.
251        :param schema_sample_rows: Specifies how many rows should be used to determine
252                                   the dtype of a column. Cannot be 0. A specific dtype can be
253                                   enforced for some or all columns through the `dtypes` parameter.
254                                   If `None`, all rows will be used.
255        :param dtype_coercion: Specifies how type coercion should behave. `coerce` (the default)
256                               will try to coerce different dtypes in a column to the same one,
257                               whereas `strict` will raise an error in case a column contains
258                               several dtypes. Note that this only applies to columns whose dtype
259                               is guessed, i.e. not specified via `dtypes`.
260        :param use_columns: Specifies the columns to use. Can either be:
261                            - `None` to select all columns
262                            - A list of strings and ints, the column names and/or indices
263                              (starting at 0)
264                            - A string, a comma separated list of Excel column letters and column
265                              ranges (e.g. `“A:E”` or `“A,C,E:F”`, which would result in
266                              `A,B,C,D,E` and `A,C,E,F`)
267                            - A callable, a function that takes a column and returns a boolean
268                              indicating whether the column should be used
269        :param dtypes: An optional dtype (for all columns)
270                       or dict of dtypes with keys as column indices or names.
271        """
272        return ExcelSheet(
273            self._reader.load_sheet(
274                idx_or_name=idx_or_name,
275                header_row=header_row,
276                column_names=column_names,
277                skip_rows=skip_rows,
278                n_rows=n_rows,
279                schema_sample_rows=schema_sample_rows,
280                dtype_coercion=dtype_coercion,
281                use_columns=use_columns,
282                dtypes=dtypes,
283                eager=False,
284            )
285        )
286
287    def table_names(self, sheet_name: str | None = None) -> list[str]:
288        """The list of table names.
289
290        Will return an empty list if no tables are found.
291
292        :param sheet_name: If given, will limit the list to the given sheet, will be faster
293        too.
294        """
295        return self._reader.table_names(sheet_name)
296
297    @typing.overload
298    def load_table(
299        self,
300        name: str,
301        *,
302        header_row: int | None = None,
303        column_names: list[str] | None = None,
304        skip_rows: int = 0,
305        n_rows: int | None = None,
306        schema_sample_rows: int | None = 1_000,
307        dtype_coercion: Literal["coerce", "strict"] = "coerce",
308        use_columns: list[str]
309        | list[int]
310        | str
311        | Callable[[ColumnInfoNoDtype], bool]
312        | None = None,
313        dtypes: DType | DTypeMap | None = None,
314        eager: Literal[False] = ...,
315    ) -> ExcelTable: ...
316    @typing.overload
317    def load_table(
318        self,
319        name: str,
320        *,
321        header_row: int | None = None,
322        column_names: list[str] | None = None,
323        skip_rows: int = 0,
324        n_rows: int | None = None,
325        schema_sample_rows: int | None = 1_000,
326        dtype_coercion: Literal["coerce", "strict"] = "coerce",
327        use_columns: list[str]
328        | list[int]
329        | str
330        | Callable[[ColumnInfoNoDtype], bool]
331        | None = None,
332        dtypes: DType | DTypeMap | None = None,
333        eager: Literal[True] = ...,
334    ) -> pa.RecordBatch: ...
335    def load_table(
336        self,
337        name: str,
338        *,
339        header_row: int | None = None,
340        column_names: list[str] | None = None,
341        skip_rows: int = 0,
342        n_rows: int | None = None,
343        schema_sample_rows: int | None = 1_000,
344        dtype_coercion: Literal["coerce", "strict"] = "coerce",
345        use_columns: list[str]
346        | list[int]
347        | str
348        | Callable[[ColumnInfoNoDtype], bool]
349        | None = None,
350        dtypes: DType | DTypeMap | None = None,
351        eager: bool = False,
352    ) -> ExcelTable | pa.RecordBatch:
353        """Loads a table by name.
354
355        :param name: The name of the table to load.
356        :param header_row: The index of the row containing the column labels.
357                           If `None`, the table's column names will be used.
358                           Any rows before the `header_row` will be automatically skipped.
359        :param column_names: Overrides headers found in the document.
360                             If `column_names` is used, `header_row` will be ignored.
361        :param n_rows: Specifies how many rows should be loaded.
362                       If `None`, all rows are loaded
363        :param skip_rows: Specifies how many rows should be skipped after the `header_row`.
364                          Any rows before the `header_row` are automatically skipped.
365                          If `header_row` is `None`, it skips the number of rows from the
366                          start of the sheet.
367        :param schema_sample_rows: Specifies how many rows should be used to determine
368                                   the dtype of a column. Cannot be 0. A specific dtype can be
369                                   enforced for some or all columns through the `dtypes` parameter.
370                                   If `None`, all rows will be used.
371        :param dtype_coercion: Specifies how type coercion should behave. `coerce` (the default)
372                               will try to coerce different dtypes in a column to the same one,
373                               whereas `strict` will raise an error in case a column contains
374                               several dtypes. Note that this only applies to columns whose dtype
375                               is guessed, i.e. not specified via `dtypes`.
376        :param use_columns: Specifies the columns to use. Can either be:
377                            - `None` to select all columns
378                            - A list of strings and ints, the column names and/or indices
379                              (starting at 0)
380                            - A string, a comma separated list of Excel column letters and column
381                              ranges (e.g. `“A:E”` or `“A,C,E:F”`, which would result in
382                              `A,B,C,D,E` and `A,C,E,F`)
383                            - A callable, a function that takes a column and returns a boolean
384                              indicating whether the column should be used
385        :param dtypes: An optional dtype (for all columns)
386                       or dict of dtypes with keys as column indices or names.
387        """
388        output = self._reader.load_table(  # type:ignore[call-overload,misc]
389            name=name,
390            header_row=header_row,
391            column_names=column_names,
392            skip_rows=skip_rows,
393            n_rows=n_rows,
394            schema_sample_rows=schema_sample_rows,
395            dtype_coercion=dtype_coercion,
396            use_columns=use_columns,
397            dtypes=dtypes,
398            eager=eager,
399        )
400        if eager:
401            return output
402        return ExcelTable(output)
403
404    def load_sheet_eager(
405        self,
406        idx_or_name: int | str,
407        *,
408        header_row: int | None = 0,
409        column_names: list[str] | None = None,
410        skip_rows: int | None = None,
411        n_rows: int | None = None,
412        schema_sample_rows: int | None = 1_000,
413        dtype_coercion: Literal["coerce", "strict"] = "coerce",
414        use_columns: list[str] | list[int] | str | None = None,
415        dtypes: DType | DTypeMap | None = None,
416    ) -> pa.RecordBatch:
417        """Loads a sheet eagerly by index or name.
418
419        For xlsx files, this will be faster and more memory-efficient, as it will use
420        `worksheet_range_ref` under the hood, which returns borrowed types.
421
422        Refer to `load_sheet` for parameter documentation
423        """
424        return self._reader.load_sheet(
425            idx_or_name=idx_or_name,
426            header_row=header_row,
427            column_names=column_names,
428            skip_rows=skip_rows,
429            n_rows=n_rows,
430            schema_sample_rows=schema_sample_rows,
431            dtype_coercion=dtype_coercion,
432            use_columns=use_columns,
433            dtypes=dtypes,
434            eager=True,
435        )
436
437    def load_sheet_by_name(
438        self,
439        name: str,
440        *,
441        header_row: int | None = 0,
442        column_names: list[str] | None = None,
443        skip_rows: int | None = None,
444        n_rows: int | None = None,
445        schema_sample_rows: int | None = 1_000,
446        dtype_coercion: Literal["coerce", "strict"] = "coerce",
447        use_columns: list[str]
448        | list[int]
449        | str
450        | Callable[[ColumnInfoNoDtype], bool]
451        | None = None,
452        dtypes: DType | DTypeMap | None = None,
453    ) -> ExcelSheet:
454        """Loads a sheet by name.
455
456        Refer to `load_sheet` for parameter documentation
457        """
458        return self.load_sheet(
459            name,
460            header_row=header_row,
461            column_names=column_names,
462            skip_rows=skip_rows,
463            n_rows=n_rows,
464            schema_sample_rows=schema_sample_rows,
465            dtype_coercion=dtype_coercion,
466            use_columns=use_columns,
467            dtypes=dtypes,
468        )
469
470    def load_sheet_by_idx(
471        self,
472        idx: int,
473        *,
474        header_row: int | None = 0,
475        column_names: list[str] | None = None,
476        skip_rows: int | None = None,
477        n_rows: int | None = None,
478        schema_sample_rows: int | None = 1_000,
479        dtype_coercion: Literal["coerce", "strict"] = "coerce",
480        use_columns: list[str]
481        | list[int]
482        | str
483        | Callable[[ColumnInfoNoDtype], bool]
484        | None = None,
485        dtypes: DType | DTypeMap | None = None,
486    ) -> ExcelSheet:
487        """Loads a sheet by index.
488
489        Refer to `load_sheet` for parameter documentation
490        """
491        return self.load_sheet(
492            idx,
493            header_row=header_row,
494            column_names=column_names,
495            skip_rows=skip_rows,
496            n_rows=n_rows,
497            schema_sample_rows=schema_sample_rows,
498            dtype_coercion=dtype_coercion,
499            use_columns=use_columns,
500            dtypes=dtypes,
501        )
502
503    def __repr__(self) -> str:
504        return self._reader.__repr__()
505
506
507def read_excel(source: Path | str | bytes) -> ExcelReader:
508    """Opens and loads an excel file.
509
510    :param source: The path to a file or its content as bytes
511    """
512    if isinstance(source, (str, Path)):
513        source = expanduser(source)
514    return ExcelReader(_read_excel(source))
515
516
517__all__ = (
518    ## version
519    "__version__",
520    ## main entrypoint
521    "read_excel",
522    ## Python types
523    "DType",
524    "DTypeMap",
525    # Excel reader
526    "ExcelReader",
527    # Excel sheet
528    "ExcelSheet",
529    # Column metadata
530    "DTypeFrom",
531    "ColumnNameFrom",
532    "ColumnInfo",
533    # Parse error information
534    "CellError",
535    "CellErrors",
536    # Exceptions
537    "FastExcelError",
538    "CannotRetrieveCellDataError",
539    "CalamineCellError",
540    "CalamineError",
541    "SheetNotFoundError",
542    "ColumnNotFoundError",
543    "ArrowError",
544    "InvalidParametersError",
545    "UnsupportedColumnTypeCombinationError",
546)

__version__ = '0.14.0'

def read_excel(source: pathlib.Path | str | bytes) -> ExcelReader: View Source

508def read_excel(source: Path | str | bytes) -> ExcelReader:
509    """Opens and loads an excel file.
510
511    :param source: The path to a file or its content as bytes
512    """
513    if isinstance(source, (str, Path)):
514        source = expanduser(source)
515    return ExcelReader(_read_excel(source))

Opens and loads an excel file.

Parameters

source: The path to a file or its content as bytes

DType = typing.Literal['null', 'int', 'float', 'string', 'boolean', 'datetime', 'date', 'duration']

DTypeMap: TypeAlias = 'dict[str | int, DType]'

class ExcelReader: View Source

207class ExcelReader:
208    """A class representing an open Excel file and allowing to read its sheets"""
209
210    def __init__(self, reader: _ExcelReader) -> None:
211        self._reader = reader
212
213    @property
214    def sheet_names(self) -> list[str]:
215        """The list of sheet names"""
216        return self._reader.sheet_names
217
218    def load_sheet(
219        self,
220        idx_or_name: int | str,
221        *,
222        header_row: int | None = 0,
223        column_names: list[str] | None = None,
224        skip_rows: int | None = None,
225        n_rows: int | None = None,
226        schema_sample_rows: int | None = 1_000,
227        dtype_coercion: Literal["coerce", "strict"] = "coerce",
228        use_columns: list[str]
229        | list[int]
230        | str
231        | Callable[[ColumnInfoNoDtype], bool]
232        | None = None,
233        dtypes: DType | DTypeMap | None = None,
234    ) -> ExcelSheet:
235        """Loads a sheet lazily by index or name.
236
237        :param idx_or_name: The index (starting at 0) or the name of the sheet to load.
238        :param header_row: The index of the row containing the column labels, default index is 0.
239                           If `None`, the sheet does not have any column labels.
240                           Any rows before the `header_row` will be automatically skipped.
241        :param column_names: Overrides headers found in the document.
242                             If `column_names` is used, `header_row` will be ignored.
243        :param n_rows: Specifies how many rows should be loaded.
244                       If `None`, all rows are loaded
245        :param skip_rows: Specifies how many rows should be skipped after the `header_row`.
246                          Any rows before the `header_row` are automatically skipped.
247                          If `header_row` is `None`:
248                            - if `skip_rows` is `None` (default): it skips all empty rows
249                            at the beginning of the sheet.
250                            - if `skip_rows` is a number, it skips the specified number
251                            of rows from the start of the sheet.
252        :param schema_sample_rows: Specifies how many rows should be used to determine
253                                   the dtype of a column. Cannot be 0. A specific dtype can be
254                                   enforced for some or all columns through the `dtypes` parameter.
255                                   If `None`, all rows will be used.
256        :param dtype_coercion: Specifies how type coercion should behave. `coerce` (the default)
257                               will try to coerce different dtypes in a column to the same one,
258                               whereas `strict` will raise an error in case a column contains
259                               several dtypes. Note that this only applies to columns whose dtype
260                               is guessed, i.e. not specified via `dtypes`.
261        :param use_columns: Specifies the columns to use. Can either be:
262                            - `None` to select all columns
263                            - A list of strings and ints, the column names and/or indices
264                              (starting at 0)
265                            - A string, a comma separated list of Excel column letters and column
266                              ranges (e.g. `“A:E”` or `“A,C,E:F”`, which would result in
267                              `A,B,C,D,E` and `A,C,E,F`)
268                            - A callable, a function that takes a column and returns a boolean
269                              indicating whether the column should be used
270        :param dtypes: An optional dtype (for all columns)
271                       or dict of dtypes with keys as column indices or names.
272        """
273        return ExcelSheet(
274            self._reader.load_sheet(
275                idx_or_name=idx_or_name,
276                header_row=header_row,
277                column_names=column_names,
278                skip_rows=skip_rows,
279                n_rows=n_rows,
280                schema_sample_rows=schema_sample_rows,
281                dtype_coercion=dtype_coercion,
282                use_columns=use_columns,
283                dtypes=dtypes,
284                eager=False,
285            )
286        )
287
288    def table_names(self, sheet_name: str | None = None) -> list[str]:
289        """The list of table names.
290
291        Will return an empty list if no tables are found.
292
293        :param sheet_name: If given, will limit the list to the given sheet, will be faster
294        too.
295        """
296        return self._reader.table_names(sheet_name)
297
298    @typing.overload
299    def load_table(
300        self,
301        name: str,
302        *,
303        header_row: int | None = None,
304        column_names: list[str] | None = None,
305        skip_rows: int = 0,
306        n_rows: int | None = None,
307        schema_sample_rows: int | None = 1_000,
308        dtype_coercion: Literal["coerce", "strict"] = "coerce",
309        use_columns: list[str]
310        | list[int]
311        | str
312        | Callable[[ColumnInfoNoDtype], bool]
313        | None = None,
314        dtypes: DType | DTypeMap | None = None,
315        eager: Literal[False] = ...,
316    ) -> ExcelTable: ...
317    @typing.overload
318    def load_table(
319        self,
320        name: str,
321        *,
322        header_row: int | None = None,
323        column_names: list[str] | None = None,
324        skip_rows: int = 0,
325        n_rows: int | None = None,
326        schema_sample_rows: int | None = 1_000,
327        dtype_coercion: Literal["coerce", "strict"] = "coerce",
328        use_columns: list[str]
329        | list[int]
330        | str
331        | Callable[[ColumnInfoNoDtype], bool]
332        | None = None,
333        dtypes: DType | DTypeMap | None = None,
334        eager: Literal[True] = ...,
335    ) -> pa.RecordBatch: ...
336    def load_table(
337        self,
338        name: str,
339        *,
340        header_row: int | None = None,
341        column_names: list[str] | None = None,
342        skip_rows: int = 0,
343        n_rows: int | None = None,
344        schema_sample_rows: int | None = 1_000,
345        dtype_coercion: Literal["coerce", "strict"] = "coerce",
346        use_columns: list[str]
347        | list[int]
348        | str
349        | Callable[[ColumnInfoNoDtype], bool]
350        | None = None,
351        dtypes: DType | DTypeMap | None = None,
352        eager: bool = False,
353    ) -> ExcelTable | pa.RecordBatch:
354        """Loads a table by name.
355
356        :param name: The name of the table to load.
357        :param header_row: The index of the row containing the column labels.
358                           If `None`, the table's column names will be used.
359                           Any rows before the `header_row` will be automatically skipped.
360        :param column_names: Overrides headers found in the document.
361                             If `column_names` is used, `header_row` will be ignored.
362        :param n_rows: Specifies how many rows should be loaded.
363                       If `None`, all rows are loaded
364        :param skip_rows: Specifies how many rows should be skipped after the `header_row`.
365                          Any rows before the `header_row` are automatically skipped.
366                          If `header_row` is `None`, it skips the number of rows from the
367                          start of the sheet.
368        :param schema_sample_rows: Specifies how many rows should be used to determine
369                                   the dtype of a column. Cannot be 0. A specific dtype can be
370                                   enforced for some or all columns through the `dtypes` parameter.
371                                   If `None`, all rows will be used.
372        :param dtype_coercion: Specifies how type coercion should behave. `coerce` (the default)
373                               will try to coerce different dtypes in a column to the same one,
374                               whereas `strict` will raise an error in case a column contains
375                               several dtypes. Note that this only applies to columns whose dtype
376                               is guessed, i.e. not specified via `dtypes`.
377        :param use_columns: Specifies the columns to use. Can either be:
378                            - `None` to select all columns
379                            - A list of strings and ints, the column names and/or indices
380                              (starting at 0)
381                            - A string, a comma separated list of Excel column letters and column
382                              ranges (e.g. `“A:E”` or `“A,C,E:F”`, which would result in
383                              `A,B,C,D,E` and `A,C,E,F`)
384                            - A callable, a function that takes a column and returns a boolean
385                              indicating whether the column should be used
386        :param dtypes: An optional dtype (for all columns)
387                       or dict of dtypes with keys as column indices or names.
388        """
389        output = self._reader.load_table(  # type:ignore[call-overload,misc]
390            name=name,
391            header_row=header_row,
392            column_names=column_names,
393            skip_rows=skip_rows,
394            n_rows=n_rows,
395            schema_sample_rows=schema_sample_rows,
396            dtype_coercion=dtype_coercion,
397            use_columns=use_columns,
398            dtypes=dtypes,
399            eager=eager,
400        )
401        if eager:
402            return output
403        return ExcelTable(output)
404
405    def load_sheet_eager(
406        self,
407        idx_or_name: int | str,
408        *,
409        header_row: int | None = 0,
410        column_names: list[str] | None = None,
411        skip_rows: int | None = None,
412        n_rows: int | None = None,
413        schema_sample_rows: int | None = 1_000,
414        dtype_coercion: Literal["coerce", "strict"] = "coerce",
415        use_columns: list[str] | list[int] | str | None = None,
416        dtypes: DType | DTypeMap | None = None,
417    ) -> pa.RecordBatch:
418        """Loads a sheet eagerly by index or name.
419
420        For xlsx files, this will be faster and more memory-efficient, as it will use
421        `worksheet_range_ref` under the hood, which returns borrowed types.
422
423        Refer to `load_sheet` for parameter documentation
424        """
425        return self._reader.load_sheet(
426            idx_or_name=idx_or_name,
427            header_row=header_row,
428            column_names=column_names,
429            skip_rows=skip_rows,
430            n_rows=n_rows,
431            schema_sample_rows=schema_sample_rows,
432            dtype_coercion=dtype_coercion,
433            use_columns=use_columns,
434            dtypes=dtypes,
435            eager=True,
436        )
437
438    def load_sheet_by_name(
439        self,
440        name: str,
441        *,
442        header_row: int | None = 0,
443        column_names: list[str] | None = None,
444        skip_rows: int | None = None,
445        n_rows: int | None = None,
446        schema_sample_rows: int | None = 1_000,
447        dtype_coercion: Literal["coerce", "strict"] = "coerce",
448        use_columns: list[str]
449        | list[int]
450        | str
451        | Callable[[ColumnInfoNoDtype], bool]
452        | None = None,
453        dtypes: DType | DTypeMap | None = None,
454    ) -> ExcelSheet:
455        """Loads a sheet by name.
456
457        Refer to `load_sheet` for parameter documentation
458        """
459        return self.load_sheet(
460            name,
461            header_row=header_row,
462            column_names=column_names,
463            skip_rows=skip_rows,
464            n_rows=n_rows,
465            schema_sample_rows=schema_sample_rows,
466            dtype_coercion=dtype_coercion,
467            use_columns=use_columns,
468            dtypes=dtypes,
469        )
470
471    def load_sheet_by_idx(
472        self,
473        idx: int,
474        *,
475        header_row: int | None = 0,
476        column_names: list[str] | None = None,
477        skip_rows: int | None = None,
478        n_rows: int | None = None,
479        schema_sample_rows: int | None = 1_000,
480        dtype_coercion: Literal["coerce", "strict"] = "coerce",
481        use_columns: list[str]
482        | list[int]
483        | str
484        | Callable[[ColumnInfoNoDtype], bool]
485        | None = None,
486        dtypes: DType | DTypeMap | None = None,
487    ) -> ExcelSheet:
488        """Loads a sheet by index.
489
490        Refer to `load_sheet` for parameter documentation
491        """
492        return self.load_sheet(
493            idx,
494            header_row=header_row,
495            column_names=column_names,
496            skip_rows=skip_rows,
497            n_rows=n_rows,
498            schema_sample_rows=schema_sample_rows,
499            dtype_coercion=dtype_coercion,
500            use_columns=use_columns,
501            dtypes=dtypes,
502        )
503
504    def __repr__(self) -> str:
505        return self._reader.__repr__()

A class representing an open Excel file and allowing to read its sheets

ExcelReader(reader: _ExcelReader) View Source

210    def __init__(self, reader: _ExcelReader) -> None:
211        self._reader = reader

sheet_names: list[str] View Source

213    @property
214    def sheet_names(self) -> list[str]:
215        """The list of sheet names"""
216        return self._reader.sheet_names

The list of sheet names

def load_sheet( self, idx_or_name: int | str, *, header_row: int | None = 0, column_names: list[str] | None = None, skip_rows: int | None = None, n_rows: int | None = None, schema_sample_rows: int | None = 1000, dtype_coercion: Literal['coerce', 'strict'] = 'coerce', use_columns: Union[list[str], list[int], str, Callable[[ColumnInfoNoDtype], bool], NoneType] = None, dtypes: Union[Literal['null', 'int', 'float', 'string', 'boolean', 'datetime', 'date', 'duration'], dict[str | int, Literal['null', 'int', 'float', 'string', 'boolean', 'datetime', 'date', 'duration']], NoneType] = None) -> ExcelSheet: View Source

218    def load_sheet(
219        self,
220        idx_or_name: int | str,
221        *,
222        header_row: int | None = 0,
223        column_names: list[str] | None = None,
224        skip_rows: int | None = None,
225        n_rows: int | None = None,
226        schema_sample_rows: int | None = 1_000,
227        dtype_coercion: Literal["coerce", "strict"] = "coerce",
228        use_columns: list[str]
229        | list[int]
230        | str
231        | Callable[[ColumnInfoNoDtype], bool]
232        | None = None,
233        dtypes: DType | DTypeMap | None = None,
234    ) -> ExcelSheet:
235        """Loads a sheet lazily by index or name.
236
237        :param idx_or_name: The index (starting at 0) or the name of the sheet to load.
238        :param header_row: The index of the row containing the column labels, default index is 0.
239                           If `None`, the sheet does not have any column labels.
240                           Any rows before the `header_row` will be automatically skipped.
241        :param column_names: Overrides headers found in the document.
242                             If `column_names` is used, `header_row` will be ignored.
243        :param n_rows: Specifies how many rows should be loaded.
244                       If `None`, all rows are loaded
245        :param skip_rows: Specifies how many rows should be skipped after the `header_row`.
246                          Any rows before the `header_row` are automatically skipped.
247                          If `header_row` is `None`:
248                            - if `skip_rows` is `None` (default): it skips all empty rows
249                            at the beginning of the sheet.
250                            - if `skip_rows` is a number, it skips the specified number
251                            of rows from the start of the sheet.
252        :param schema_sample_rows: Specifies how many rows should be used to determine
253                                   the dtype of a column. Cannot be 0. A specific dtype can be
254                                   enforced for some or all columns through the `dtypes` parameter.
255                                   If `None`, all rows will be used.
256        :param dtype_coercion: Specifies how type coercion should behave. `coerce` (the default)
257                               will try to coerce different dtypes in a column to the same one,
258                               whereas `strict` will raise an error in case a column contains
259                               several dtypes. Note that this only applies to columns whose dtype
260                               is guessed, i.e. not specified via `dtypes`.
261        :param use_columns: Specifies the columns to use. Can either be:
262                            - `None` to select all columns
263                            - A list of strings and ints, the column names and/or indices
264                              (starting at 0)
265                            - A string, a comma separated list of Excel column letters and column
266                              ranges (e.g. `“A:E”` or `“A,C,E:F”`, which would result in
267                              `A,B,C,D,E` and `A,C,E,F`)
268                            - A callable, a function that takes a column and returns a boolean
269                              indicating whether the column should be used
270        :param dtypes: An optional dtype (for all columns)
271                       or dict of dtypes with keys as column indices or names.
272        """
273        return ExcelSheet(
274            self._reader.load_sheet(
275                idx_or_name=idx_or_name,
276                header_row=header_row,
277                column_names=column_names,
278                skip_rows=skip_rows,
279                n_rows=n_rows,
280                schema_sample_rows=schema_sample_rows,
281                dtype_coercion=dtype_coercion,
282                use_columns=use_columns,
283                dtypes=dtypes,
284                eager=False,
285            )
286        )

Loads a sheet lazily by index or name.

Parameters

idx_or_name: The index (starting at 0) or the name of the sheet to load.
header_row: The index of the row containing the column labels, default index is 0. If None, the sheet does not have any column labels. Any rows before the header_row will be automatically skipped.
column_names: Overrides headers found in the document. If column_names is used, header_row will be ignored.
n_rows: Specifies how many rows should be loaded. If None, all rows are loaded
skip_rows: Specifies how many rows should be skipped after the header_row. Any rows before the header_row are automatically skipped. If header_row is None:
- if skip_rows is None (default): it skips all empty rows at the beginning of the sheet.
- if skip_rows is a number, it skips the specified number of rows from the start of the sheet.
schema_sample_rows: Specifies how many rows should be used to determine the dtype of a column. Cannot be 0. A specific dtype can be enforced for some or all columns through the dtypes parameter. If None, all rows will be used.
dtype_coercion: Specifies how type coercion should behave. coerce (the default) will try to coerce different dtypes in a column to the same one, whereas strict will raise an error in case a column contains several dtypes. Note that this only applies to columns whose dtype is guessed, i.e. not specified via dtypes.
use_columns: Specifies the columns to use. Can either be:
- None to select all columns
- A list of strings and ints, the column names and/or indices (starting at 0)
- A string, a comma separated list of Excel column letters and column ranges (e.g. “A:E” or “A,C,E:F”, which would result in A,B,C,D,E and A,C,E,F)
- A callable, a function that takes a column and returns a boolean indicating whether the column should be used
dtypes: An optional dtype (for all columns) or dict of dtypes with keys as column indices or names.

def table_names(self, sheet_name: str | None = None) -> list[str]: View Source

288    def table_names(self, sheet_name: str | None = None) -> list[str]:
289        """The list of table names.
290
291        Will return an empty list if no tables are found.
292
293        :param sheet_name: If given, will limit the list to the given sheet, will be faster
294        too.
295        """
296        return self._reader.table_names(sheet_name)

The list of table names.

Will return an empty list if no tables are found.

Parameters

sheet_name: If given, will limit the list to the given sheet, will be faster too.

def load_table( self, name: str, *, header_row: int | None = None, column_names: list[str] | None = None, skip_rows: int = 0, n_rows: int | None = None, schema_sample_rows: int | None = 1000, dtype_coercion: Literal['coerce', 'strict'] = 'coerce', use_columns: Union[list[str], list[int], str, Callable[[ColumnInfoNoDtype], bool], NoneType] = None, dtypes: Union[Literal['null', 'int', 'float', 'string', 'boolean', 'datetime', 'date', 'duration'], dict[str | int, Literal['null', 'int', 'float', 'string', 'boolean', 'datetime', 'date', 'duration']], NoneType] = None, eager: bool = False) -> fastexcel.ExcelTable | pyarrow.lib.RecordBatch: View Source

336    def load_table(
337        self,
338        name: str,
339        *,
340        header_row: int | None = None,
341        column_names: list[str] | None = None,
342        skip_rows: int = 0,
343        n_rows: int | None = None,
344        schema_sample_rows: int | None = 1_000,
345        dtype_coercion: Literal["coerce", "strict"] = "coerce",
346        use_columns: list[str]
347        | list[int]
348        | str
349        | Callable[[ColumnInfoNoDtype], bool]
350        | None = None,
351        dtypes: DType | DTypeMap | None = None,
352        eager: bool = False,
353    ) -> ExcelTable | pa.RecordBatch:
354        """Loads a table by name.
355
356        :param name: The name of the table to load.
357        :param header_row: The index of the row containing the column labels.
358                           If `None`, the table's column names will be used.
359                           Any rows before the `header_row` will be automatically skipped.
360        :param column_names: Overrides headers found in the document.
361                             If `column_names` is used, `header_row` will be ignored.
362        :param n_rows: Specifies how many rows should be loaded.
363                       If `None`, all rows are loaded
364        :param skip_rows: Specifies how many rows should be skipped after the `header_row`.
365                          Any rows before the `header_row` are automatically skipped.
366                          If `header_row` is `None`, it skips the number of rows from the
367                          start of the sheet.
368        :param schema_sample_rows: Specifies how many rows should be used to determine
369                                   the dtype of a column. Cannot be 0. A specific dtype can be
370                                   enforced for some or all columns through the `dtypes` parameter.
371                                   If `None`, all rows will be used.
372        :param dtype_coercion: Specifies how type coercion should behave. `coerce` (the default)
373                               will try to coerce different dtypes in a column to the same one,
374                               whereas `strict` will raise an error in case a column contains
375                               several dtypes. Note that this only applies to columns whose dtype
376                               is guessed, i.e. not specified via `dtypes`.
377        :param use_columns: Specifies the columns to use. Can either be:
378                            - `None` to select all columns
379                            - A list of strings and ints, the column names and/or indices
380                              (starting at 0)
381                            - A string, a comma separated list of Excel column letters and column
382                              ranges (e.g. `“A:E”` or `“A,C,E:F”`, which would result in
383                              `A,B,C,D,E` and `A,C,E,F`)
384                            - A callable, a function that takes a column and returns a boolean
385                              indicating whether the column should be used
386        :param dtypes: An optional dtype (for all columns)
387                       or dict of dtypes with keys as column indices or names.
388        """
389        output = self._reader.load_table(  # type:ignore[call-overload,misc]
390            name=name,
391            header_row=header_row,
392            column_names=column_names,
393            skip_rows=skip_rows,
394            n_rows=n_rows,
395            schema_sample_rows=schema_sample_rows,
396            dtype_coercion=dtype_coercion,
397            use_columns=use_columns,
398            dtypes=dtypes,
399            eager=eager,
400        )
401        if eager:
402            return output
403        return ExcelTable(output)

Loads a table by name.

Parameters

name: The name of the table to load.
header_row: The index of the row containing the column labels. If None, the table's column names will be used. Any rows before the header_row will be automatically skipped.
column_names: Overrides headers found in the document. If column_names is used, header_row will be ignored.
n_rows: Specifies how many rows should be loaded. If None, all rows are loaded
skip_rows: Specifies how many rows should be skipped after the header_row. Any rows before the header_row are automatically skipped. If header_row is None, it skips the number of rows from the start of the sheet.
schema_sample_rows: Specifies how many rows should be used to determine the dtype of a column. Cannot be 0. A specific dtype can be enforced for some or all columns through the dtypes parameter. If None, all rows will be used.
dtype_coercion: Specifies how type coercion should behave. coerce (the default) will try to coerce different dtypes in a column to the same one, whereas strict will raise an error in case a column contains several dtypes. Note that this only applies to columns whose dtype is guessed, i.e. not specified via dtypes.
use_columns: Specifies the columns to use. Can either be:
- None to select all columns
- A list of strings and ints, the column names and/or indices (starting at 0)
- A string, a comma separated list of Excel column letters and column ranges (e.g. “A:E” or “A,C,E:F”, which would result in A,B,C,D,E and A,C,E,F)
- A callable, a function that takes a column and returns a boolean indicating whether the column should be used
dtypes: An optional dtype (for all columns) or dict of dtypes with keys as column indices or names.

def load_sheet_eager( self, idx_or_name: int | str, *, header_row: int | None = 0, column_names: list[str] | None = None, skip_rows: int | None = None, n_rows: int | None = None, schema_sample_rows: int | None = 1000, dtype_coercion: Literal['coerce', 'strict'] = 'coerce', use_columns: list[str] | list[int] | str | None = None, dtypes: Union[Literal['null', 'int', 'float', 'string', 'boolean', 'datetime', 'date', 'duration'], dict[str | int, Literal['null', 'int', 'float', 'string', 'boolean', 'datetime', 'date', 'duration']], NoneType] = None) -> pyarrow.lib.RecordBatch: View Source

405    def load_sheet_eager(
406        self,
407        idx_or_name: int | str,
408        *,
409        header_row: int | None = 0,
410        column_names: list[str] | None = None,
411        skip_rows: int | None = None,
412        n_rows: int | None = None,
413        schema_sample_rows: int | None = 1_000,
414        dtype_coercion: Literal["coerce", "strict"] = "coerce",
415        use_columns: list[str] | list[int] | str | None = None,
416        dtypes: DType | DTypeMap | None = None,
417    ) -> pa.RecordBatch:
418        """Loads a sheet eagerly by index or name.
419
420        For xlsx files, this will be faster and more memory-efficient, as it will use
421        `worksheet_range_ref` under the hood, which returns borrowed types.
422
423        Refer to `load_sheet` for parameter documentation
424        """
425        return self._reader.load_sheet(
426            idx_or_name=idx_or_name,
427            header_row=header_row,
428            column_names=column_names,
429            skip_rows=skip_rows,
430            n_rows=n_rows,
431            schema_sample_rows=schema_sample_rows,
432            dtype_coercion=dtype_coercion,
433            use_columns=use_columns,
434            dtypes=dtypes,
435            eager=True,
436        )

Loads a sheet eagerly by index or name.

For xlsx files, this will be faster and more memory-efficient, as it will use worksheet_range_ref under the hood, which returns borrowed types.

Refer to load_sheet for parameter documentation

def load_sheet_by_name( self, name: str, *, header_row: int | None = 0, column_names: list[str] | None = None, skip_rows: int | None = None, n_rows: int | None = None, schema_sample_rows: int | None = 1000, dtype_coercion: Literal['coerce', 'strict'] = 'coerce', use_columns: Union[list[str], list[int], str, Callable[[ColumnInfoNoDtype], bool], NoneType] = None, dtypes: Union[Literal['null', 'int', 'float', 'string', 'boolean', 'datetime', 'date', 'duration'], dict[str | int, Literal['null', 'int', 'float', 'string', 'boolean', 'datetime', 'date', 'duration']], NoneType] = None) -> ExcelSheet: View Source

438    def load_sheet_by_name(
439        self,
440        name: str,
441        *,
442        header_row: int | None = 0,
443        column_names: list[str] | None = None,
444        skip_rows: int | None = None,
445        n_rows: int | None = None,
446        schema_sample_rows: int | None = 1_000,
447        dtype_coercion: Literal["coerce", "strict"] = "coerce",
448        use_columns: list[str]
449        | list[int]
450        | str
451        | Callable[[ColumnInfoNoDtype], bool]
452        | None = None,
453        dtypes: DType | DTypeMap | None = None,
454    ) -> ExcelSheet:
455        """Loads a sheet by name.
456
457        Refer to `load_sheet` for parameter documentation
458        """
459        return self.load_sheet(
460            name,
461            header_row=header_row,
462            column_names=column_names,
463            skip_rows=skip_rows,
464            n_rows=n_rows,
465            schema_sample_rows=schema_sample_rows,
466            dtype_coercion=dtype_coercion,
467            use_columns=use_columns,
468            dtypes=dtypes,
469        )

Loads a sheet by name.

Refer to load_sheet for parameter documentation

def load_sheet_by_idx( self, idx: int, *, header_row: int | None = 0, column_names: list[str] | None = None, skip_rows: int | None = None, n_rows: int | None = None, schema_sample_rows: int | None = 1000, dtype_coercion: Literal['coerce', 'strict'] = 'coerce', use_columns: Union[list[str], list[int], str, Callable[[ColumnInfoNoDtype], bool], NoneType] = None, dtypes: Union[Literal['null', 'int', 'float', 'string', 'boolean', 'datetime', 'date', 'duration'], dict[str | int, Literal['null', 'int', 'float', 'string', 'boolean', 'datetime', 'date', 'duration']], NoneType] = None) -> ExcelSheet: View Source

471    def load_sheet_by_idx(
472        self,
473        idx: int,
474        *,
475        header_row: int | None = 0,
476        column_names: list[str] | None = None,
477        skip_rows: int | None = None,
478        n_rows: int | None = None,
479        schema_sample_rows: int | None = 1_000,
480        dtype_coercion: Literal["coerce", "strict"] = "coerce",
481        use_columns: list[str]
482        | list[int]
483        | str
484        | Callable[[ColumnInfoNoDtype], bool]
485        | None = None,
486        dtypes: DType | DTypeMap | None = None,
487    ) -> ExcelSheet:
488        """Loads a sheet by index.
489
490        Refer to `load_sheet` for parameter documentation
491        """
492        return self.load_sheet(
493            idx,
494            header_row=header_row,
495            column_names=column_names,
496            skip_rows=skip_rows,
497            n_rows=n_rows,
498            schema_sample_rows=schema_sample_rows,
499            dtype_coercion=dtype_coercion,
500            use_columns=use_columns,
501            dtypes=dtypes,
502        )

Loads a sheet by index.

Refer to load_sheet for parameter documentation

DTypeFrom: TypeAlias = Literal['provided_by_index', 'provided_by_name', 'guessed']

ColumnNameFrom: TypeAlias = Literal['provided', 'looked_up', 'generated']

class ColumnInfo:

This class provides information about a single column in a sheet

index

int. The index of the column

dtype_from

fastexcel.DTypeFrom. How the dtype of the column was determined.

One of three possible values:

"provided_by_index": The dtype was specified via the column index
"provided_by_name": The dtype was specified via the column name
"guessed": The dtype was determined from the content of the column

name

str. The name of the column

dtype

fastexcel.DType. The dtype of the column

column_name_from

fastexcel.ColumnNameFrom. How the name of the column was determined.

One of three possible values:

"provided": The column name was provided via the use_columns parameter
"looked_up": The column name was looked up from the data found in the sheet
"generated": The column name was generated from the column index, either because header_row was None, or because it could not be looked up

class CellError:

detail

str. The error message

position

(int, int). The original row and column of the error

offset_position

row_offset

int. The row offset

class CellErrors:

errors

class FastExcelError(builtins.Exception):

The base class for all fastexcel errors

class CannotRetrieveCellDataError(fastexcel.FastExcelError):

Data for a given cell cannot be retrieved

class CalamineCellError(fastexcel.FastExcelError):

calamine returned an error regarding the content of the cell

class CalamineError(fastexcel.FastExcelError):

Generic calamine error

class SheetNotFoundError(fastexcel.FastExcelError):

Sheet was not found

class ColumnNotFoundError(fastexcel.FastExcelError):

Column was not found

class ArrowError(fastexcel.FastExcelError):

Generic arrow error

class InvalidParametersError(fastexcel.FastExcelError):

Provided parameters are invalid

class UnsupportedColumnTypeCombinationError(fastexcel.FastExcelError):

Column contains an unsupported type combination