fastexcel

  1from __future__ import annotations
  2
  3import typing
  4from collections.abc import Callable
  5from typing import TYPE_CHECKING, Literal, TypeAlias
  6
  7if TYPE_CHECKING:
  8    import pandas as pd
  9    import polars as pl
 10    import pyarrow as pa
 11
 12from os.path import expanduser
 13from pathlib import Path
 14
 15try:
 16    import importlib.util
 17
 18    importlib.util.find_spec("pyarrow")
 19    _PYARROW_AVAILABLE = True
 20except ImportError:
 21    _PYARROW_AVAILABLE = False
 22
 23from ._fastexcel import (
 24    ArrowError,
 25    CalamineCellError,
 26    CalamineError,
 27    CannotRetrieveCellDataError,
 28    CellError,
 29    CellErrors,
 30    ColumnInfo,
 31    ColumnInfoNoDtype,
 32    ColumnNotFoundError,
 33    DefinedName,
 34    FastExcelError,
 35    InvalidParametersError,
 36    SheetNotFoundError,
 37    UnsupportedColumnTypeCombinationError,
 38    __version__,
 39    _ExcelReader,
 40    _ExcelSheet,
 41    _ExcelTable,
 42)
 43from ._fastexcel import read_excel as _read_excel
 44
 45DType = Literal["null", "int", "float", "string", "boolean", "datetime", "date", "duration"]
 46DTypeMap: TypeAlias = "dict[str | int, DType]"
 47ColumnNameFrom: TypeAlias = Literal["provided", "looked_up", "generated"]
 48DTypeFrom: TypeAlias = Literal[
 49    "provided_for_all", "provided_by_index", "provided_by_name", "guessed"
 50]
 51SheetVisible: TypeAlias = Literal["visible", "hidden", "veryhidden"]
 52
 53
 54class ExcelSheet:
 55    """A class representing a single sheet in an Excel File"""
 56
 57    def __init__(self, sheet: _ExcelSheet) -> None:
 58        self._sheet = sheet
 59
 60    @property
 61    def name(self) -> str:
 62        """The name of the sheet"""
 63        return self._sheet.name
 64
 65    @property
 66    def width(self) -> int:
 67        """The sheet's width"""
 68        return self._sheet.width
 69
 70    @property
 71    def height(self) -> int:
 72        """The sheet's height, with `skip_rows` and `nrows` applied"""
 73        return self._sheet.height
 74
 75    @property
 76    def total_height(self) -> int:
 77        """The sheet's total height"""
 78        return self._sheet.total_height
 79
 80    @property
 81    def selected_columns(self) -> list[ColumnInfo]:
 82        """The sheet's selected columns"""
 83        return self._sheet.selected_columns
 84
 85    def available_columns(self) -> list[ColumnInfo]:
 86        """The columns available for the given sheet"""
 87        return self._sheet.available_columns()
 88
 89    @property
 90    def specified_dtypes(self) -> DTypeMap | None:
 91        """The dtypes specified for the sheet"""
 92        return self._sheet.specified_dtypes
 93
 94    @property
 95    def visible(self) -> SheetVisible:
 96        """The visibility of the sheet"""
 97        return self._sheet.visible
 98
 99    def to_arrow(self) -> pa.RecordBatch:
100        """Converts the sheet to a pyarrow `RecordBatch`
101
102        Requires the `pyarrow` extra to be installed.
103        """
104        if not _PYARROW_AVAILABLE:
105            raise ImportError(
106                "pyarrow is required for to_arrow(). Install with: pip install 'fastexcel[pyarrow]'"
107            )
108        return self._sheet.to_arrow()
109
110    def to_arrow_with_errors(self) -> tuple[pa.RecordBatch, CellErrors | None]:
111        """Converts the sheet to a pyarrow `RecordBatch` with error information.
112
113        Stores the positions of any values that cannot be parsed as the specified type and were
114        therefore converted to None.
115
116        Requires the `pyarrow` extra to be installed.
117        """
118        if not _PYARROW_AVAILABLE:
119            raise ImportError(
120                "pyarrow is required for to_arrow_with_errors(). Install with: pip install 'fastexcel[pyarrow]'"  # noqa: E501
121            )
122        rb, cell_errors = self._sheet.to_arrow_with_errors()
123        if not cell_errors.errors:
124            return (rb, None)
125        return (rb, cell_errors)
126
127    def to_pandas(self) -> pd.DataFrame:
128        """Converts the sheet to a Pandas `DataFrame`.
129
130        Requires the `pandas` extra to be installed.
131        """
132        # Note: pandas PyCapsule interface requires __dataframe__ or __arrow_c_stream__
133        # which we don't implement. Using pyarrow conversion for now.
134        # (see https://pandas.pydata.org/docs/reference/api/pandas.api.interchange.from_dataframe.html)
135        return self.to_arrow().to_pandas()
136
137    def to_polars(self) -> pl.DataFrame:
138        """Converts the sheet to a Polars `DataFrame`.
139
140        Uses the Arrow PyCapsule Interface for zero-copy data exchange.
141        Requires the `polars` extra to be installed.
142        """
143        import polars as pl
144
145        return pl.DataFrame(self)
146
147    def __arrow_c_schema__(self) -> object:
148        """Export the schema as an `ArrowSchema` `PyCapsule`.
149
150        https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowschema-export
151
152        The Arrow PyCapsule Interface enables zero-copy data exchange with
153        Arrow-compatible libraries without requiring PyArrow as a dependency.
154        """
155        return self._sheet.__arrow_c_schema__()
156
157    def __arrow_c_array__(self, requested_schema: object | None = None) -> tuple[object, object]:
158        """Export the schema and data as a pair of `ArrowSchema` and `ArrowArray` `PyCapsules`.
159
160        The optional `requested_schema` parameter allows for potential schema conversion.
161
162        https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowarray-export
163
164        The Arrow PyCapsule Interface enables zero-copy data exchange with
165        Arrow-compatible libraries without requiring PyArrow as a dependency.
166        """
167        return self._sheet.__arrow_c_array__(requested_schema)
168
169    def __repr__(self) -> str:
170        return self._sheet.__repr__()
171
172
173class ExcelTable:
174    """A class representing a single table in an Excel file"""
175
176    def __init__(self, table: _ExcelTable) -> None:
177        self._table = table
178
179    @property
180    def name(self) -> str:
181        """The name of the table"""
182        return self._table.name
183
184    @property
185    def sheet_name(self) -> str:
186        """The name of the sheet this table belongs to"""
187        return self._table.sheet_name
188
189    @property
190    def width(self) -> int:
191        """The table's width"""
192        return self._table.width
193
194    @property
195    def height(self) -> int:
196        """The table's height"""
197        return self._table.height
198
199    @property
200    def total_height(self) -> int:
201        """The table's total height"""
202        return self._table.total_height
203
204    @property
205    def offset(self) -> int:
206        """The table's offset before data starts"""
207        return self._table.offset
208
209    @property
210    def selected_columns(self) -> list[ColumnInfo]:
211        """The table's selected columns"""
212        return self._table.selected_columns
213
214    def available_columns(self) -> list[ColumnInfo]:
215        """The columns available for the given table"""
216        return self._table.available_columns()
217
218    @property
219    def specified_dtypes(self) -> DTypeMap | None:
220        """The dtypes specified for the table"""
221        return self._table.specified_dtypes
222
223    def to_arrow(self) -> pa.RecordBatch:
224        """Converts the table to a pyarrow `RecordBatch`
225
226        Requires the `pyarrow` extra to be installed.
227        """
228        if not _PYARROW_AVAILABLE:
229            raise ImportError(
230                "pyarrow is required for to_arrow(). Install with: pip install 'fastexcel[pyarrow]'"
231            )
232        return self._table.to_arrow()
233
234    def to_pandas(self) -> pd.DataFrame:
235        """Converts the table to a Pandas `DataFrame`.
236
237        Requires the `pandas` extra to be installed.
238        """
239        # Note: pandas PyCapsule interface requires __dataframe__ or __arrow_c_stream__
240        # which we don't implement. Using pyarrow conversion for now.
241        # (see https://pandas.pydata.org/docs/reference/api/pandas.api.interchange.from_dataframe.html)
242        return self.to_arrow().to_pandas()
243
244    def to_polars(self) -> pl.DataFrame:
245        """Converts the table to a Polars `DataFrame`.
246
247        Uses the Arrow PyCapsule Interface for zero-copy data exchange.
248        Requires the `polars` extra to be installed.
249        """
250        import polars as pl
251
252        return pl.DataFrame(self)
253
254    def __arrow_c_schema__(self) -> object:
255        """Export the schema as an `ArrowSchema` `PyCapsule`.
256
257        https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowschema-export
258
259        The Arrow PyCapsule Interface enables zero-copy data exchange with
260        Arrow-compatible libraries without requiring PyArrow as a dependency.
261        """
262        return self._table.__arrow_c_schema__()
263
264    def __arrow_c_array__(self, requested_schema: object | None = None) -> tuple[object, object]:
265        """Export the schema and data as a pair of `ArrowSchema` and `ArrowArray` `PyCapsules`.
266
267        The optional `requested_schema` parameter allows for potential schema conversion.
268
269        https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowarray-export
270
271        The Arrow PyCapsule Interface enables zero-copy data exchange with
272        Arrow-compatible libraries without requiring PyArrow as a dependency.
273        """
274        return self._table.__arrow_c_array__(requested_schema)
275
276
277class ExcelReader:
278    """A class representing an open Excel file and allowing to read its sheets"""
279
280    def __init__(self, reader: _ExcelReader) -> None:
281        self._reader = reader
282
283    @property
284    def sheet_names(self) -> list[str]:
285        """The list of sheet names"""
286        return self._reader.sheet_names
287
288    @typing.overload
289    def load_sheet(
290        self,
291        idx_or_name: int | str,
292        *,
293        header_row: int | None = 0,
294        column_names: list[str] | None = None,
295        skip_rows: int | list[int] | Callable[[int], bool] | None = None,
296        n_rows: int | None = None,
297        schema_sample_rows: int | None = 1_000,
298        dtype_coercion: Literal["coerce", "strict"] = "coerce",
299        use_columns: list[str]
300        | list[int]
301        | str
302        | Callable[[ColumnInfoNoDtype], bool]
303        | None = None,
304        dtypes: DType | DTypeMap | None = None,
305        eager: Literal[False] = ...,
306        skip_whitespace_tail_rows: bool = False,
307        whitespace_as_null: bool = False,
308    ) -> ExcelSheet: ...
309
310    @typing.overload
311    def load_sheet(
312        self,
313        idx_or_name: int | str,
314        *,
315        header_row: int | None = 0,
316        column_names: list[str] | None = None,
317        skip_rows: int | list[int] | Callable[[int], bool] | None = None,
318        n_rows: int | None = None,
319        schema_sample_rows: int | None = 1_000,
320        dtype_coercion: Literal["coerce", "strict"] = "coerce",
321        use_columns: list[str]
322        | list[int]
323        | str
324        | Callable[[ColumnInfoNoDtype], bool]
325        | None = None,
326        dtypes: DType | DTypeMap | None = None,
327        eager: Literal[True] = ...,
328        skip_whitespace_tail_rows: bool = False,
329        whitespace_as_null: bool = False,
330    ) -> pa.RecordBatch: ...
331
332    def load_sheet(
333        self,
334        idx_or_name: int | str,
335        *,
336        header_row: int | None = 0,
337        column_names: list[str] | None = None,
338        skip_rows: int | list[int] | Callable[[int], bool] | None = None,
339        n_rows: int | None = None,
340        schema_sample_rows: int | None = 1_000,
341        dtype_coercion: Literal["coerce", "strict"] = "coerce",
342        use_columns: list[str]
343        | list[int]
344        | str
345        | Callable[[ColumnInfoNoDtype], bool]
346        | None = None,
347        dtypes: DType | DTypeMap | None = None,
348        eager: bool = False,
349        skip_whitespace_tail_rows: bool = False,
350        whitespace_as_null: bool = False,
351    ) -> ExcelSheet | pa.RecordBatch:
352        """Loads a sheet by index or name.
353
354        :param idx_or_name: The index (starting at 0) or the name of the sheet to load.
355        :param header_row: The index of the row containing the column labels, default index is 0.
356                           If `None`, the sheet does not have any column labels.
357                           Any rows before the `header_row` will be automatically skipped.
358        :param column_names: Overrides headers found in the document.
359                             If `column_names` is used, `header_row` will be ignored.
360        :param n_rows: Specifies how many rows should be loaded.
361                       If `None`, all rows are loaded
362        :param skip_rows: Specifies which rows should be skipped after the `header_row`.
363                          Any rows before the `header_row` are automatically skipped.
364                          It means row indices are relative to data rows, not the sheet!
365                          Can be one of:
366                          - `int`: Skip this many rows after the header row
367                          - `list[int]`: Skip specific row indices (0-based relative to data rows)
368                          - `Callable[[int], bool]`: Function that receives row index (0-based
369                          relative to data rows) and returns True to skip the row
370                          - `None`: If `header_row` is None, skips empty rows at beginning
371        :param schema_sample_rows: Specifies how many rows should be used to determine
372                                   the dtype of a column. Cannot be 0. A specific dtype can be
373                                   enforced for some or all columns through the `dtypes` parameter.
374                                   If `None`, all rows will be used.
375        :param dtype_coercion: Specifies how type coercion should behave. `coerce` (the default)
376                               will try to coerce different dtypes in a column to the same one,
377                               whereas `strict` will raise an error in case a column contains
378                               several dtypes. Note that this only applies to columns whose dtype
379                               is guessed, i.e. not specified via `dtypes`.
380        :param use_columns: Specifies the columns to use. Can either be:
381                            - `None` to select all columns
382                            - A list of strings and ints, the column names and/or indices
383                              (starting at 0)
384                            - A string, a comma separated list of Excel column letters and column
385                              ranges (e.g. `"A:E"` or `"A,C,E:F"`, which would result in
386                              `A,B,C,D,E` and `A,C,E,F`). Also supports open-ended ranges
387                              (e.g. `"B:"` to select all columns from B onwards) and from-beginning
388                              ranges (e.g. `":C"` to select columns from A to C). These can be
389                              combined for "except" patterns (e.g. `":C,E:"` to select everything
390                              except column D)
391                            - A callable, a function that takes a column and returns a boolean
392                              indicating whether the column should be used
393        :param dtypes: An optional dtype (for all columns)
394                       or dict of dtypes with keys as column indices or names.
395        :param eager: Specifies whether the sheet should be loaded eagerly.
396                      `False` (default) will load the sheet lazily using the `PyCapsule` interface,
397                      whereas `True` will load it eagerly via `pyarrow`.
398
399                      Eager loading requires the `pyarrow` extra to be installed.
400        :param skip_whitespace_tail_rows: Skip rows at the end of the sheet
401                                          containing only whitespace and null values.
402        :param whitespace_as_null: Consider cells containing only whitespace as null values.
403        """
404        sheet_or_rb = self._reader.load_sheet(
405            idx_or_name=idx_or_name,
406            header_row=header_row,
407            column_names=column_names,
408            skip_rows=skip_rows,
409            n_rows=n_rows,
410            schema_sample_rows=schema_sample_rows,
411            dtype_coercion=dtype_coercion,
412            use_columns=use_columns,
413            dtypes=dtypes,
414            eager=eager,
415            skip_whitespace_tail_rows=skip_whitespace_tail_rows,
416            whitespace_as_null=whitespace_as_null,
417        )
418        return sheet_or_rb if eager else ExcelSheet(sheet_or_rb)
419
420    def table_names(self, sheet_name: str | None = None) -> list[str]:
421        """The list of table names.
422
423        Will return an empty list if no tables are found.
424
425        :param sheet_name: If given, will limit the list to the given sheet, will be faster
426        too.
427        """
428        return self._reader.table_names(sheet_name)
429
430    def defined_names(self) -> list[DefinedName]:
431        """The list of defined names (named ranges) in the workbook.
432
433        Returns a list of DefinedName objects with 'name' and 'formula' attributes.
434        The formula is a string representation of the range or expression.
435
436        Will return an empty list if no defined names are found.
437        """
438        return self._reader.defined_names()
439
440    @typing.overload
441    def load_table(
442        self,
443        name: str,
444        *,
445        header_row: int | None = None,
446        column_names: list[str] | None = None,
447        skip_rows: int | None = None,
448        n_rows: int | None = None,
449        schema_sample_rows: int | None = 1_000,
450        dtype_coercion: Literal["coerce", "strict"] = "coerce",
451        use_columns: list[str]
452        | list[int]
453        | str
454        | Callable[[ColumnInfoNoDtype], bool]
455        | None = None,
456        dtypes: DType | DTypeMap | None = None,
457        eager: Literal[False] = ...,
458        skip_whitespace_tail_rows: bool = False,
459        whitespace_as_null: bool = False,
460    ) -> ExcelTable: ...
461
462    @typing.overload
463    def load_table(
464        self,
465        name: str,
466        *,
467        header_row: int | None = None,
468        column_names: list[str] | None = None,
469        skip_rows: int | None = None,
470        n_rows: int | None = None,
471        schema_sample_rows: int | None = 1_000,
472        dtype_coercion: Literal["coerce", "strict"] = "coerce",
473        use_columns: list[str]
474        | list[int]
475        | str
476        | Callable[[ColumnInfoNoDtype], bool]
477        | None = None,
478        dtypes: DType | DTypeMap | None = None,
479        eager: Literal[True] = ...,
480        skip_whitespace_tail_rows: bool = False,
481        whitespace_as_null: bool = False,
482    ) -> pa.RecordBatch: ...
483
484    def load_table(
485        self,
486        name: str,
487        *,
488        header_row: int | None = None,
489        column_names: list[str] | None = None,
490        skip_rows: int | None = None,
491        n_rows: int | None = None,
492        schema_sample_rows: int | None = 1_000,
493        dtype_coercion: Literal["coerce", "strict"] = "coerce",
494        use_columns: list[str]
495        | list[int]
496        | str
497        | Callable[[ColumnInfoNoDtype], bool]
498        | None = None,
499        dtypes: DType | DTypeMap | None = None,
500        eager: bool = False,
501        skip_whitespace_tail_rows: bool = False,
502        whitespace_as_null: bool = False,
503    ) -> ExcelTable | pa.RecordBatch:
504        """Loads a table by name.
505
506        :param name: The name of the table to load.
507        :param header_row: The index of the row containing the column labels.
508                           If `None`, the table's column names will be used.
509                           Any rows before the `header_row` will be automatically skipped.
510        :param column_names: Overrides headers found in the document.
511                             If `column_names` is used, `header_row` will be ignored.
512        :param n_rows: Specifies how many rows should be loaded.
513                       If `None`, all rows are loaded
514        :param skip_rows: Specifies how many rows should be skipped after the `header_row`.
515                          Any rows before the `header_row` are automatically skipped.
516                          If `header_row` is `None`, it skips the number of rows from the
517                          start of the sheet.
518        :param schema_sample_rows: Specifies how many rows should be used to determine
519                                   the dtype of a column. Cannot be 0. A specific dtype can be
520                                   enforced for some or all columns through the `dtypes` parameter.
521                                   If `None`, all rows will be used.
522        :param dtype_coercion: Specifies how type coercion should behave. `coerce` (the default)
523                               will try to coerce different dtypes in a column to the same one,
524                               whereas `strict` will raise an error in case a column contains
525                               several dtypes. Note that this only applies to columns whose dtype
526                               is guessed, i.e. not specified via `dtypes`.
527        :param use_columns: Specifies the columns to use. Can either be:
528                            - `None` to select all columns
529                            - A list of strings and ints, the column names and/or indices
530                              (starting at 0)
531                            - A string, a comma separated list of Excel column letters and column
532                              ranges (e.g. `"A:E"` or `"A,C,E:F"`, which would result in
533                              `A,B,C,D,E` and `A,C,E,F`). Also supports open-ended ranges
534                              (e.g. `"B:"` to select all columns from B onwards) and from-beginning
535                              ranges (e.g. `":C"` to select columns from A to C). These can be
536                              combined for "except" patterns (e.g. `":C,E:"` to select everything
537                              except column D)
538                            - A callable, a function that takes a column and returns a boolean
539                              indicating whether the column should be used
540        :param dtypes: An optional dtype (for all columns)
541                       or dict of dtypes with keys as column indices or names.
542        :param eager: Specifies whether the table should be loaded eagerly.
543                      `False` (default) will load the table lazily using the `PyCapsule` interface,
544                      whereas `True` will load it eagerly via `pyarrow`.
545
546                      Eager loading requires the `pyarrow` extra to be installed.
547        :param skip_whitespace_tail_rows: Skip rows at the end of the table
548                                          containing only whitespace and null values.
549        :param whitespace_as_null: Consider cells containing only whitespace as null values.
550        """
551        if eager:
552            return self._reader.load_table(
553                name=name,
554                header_row=header_row,
555                column_names=column_names,
556                skip_rows=skip_rows,
557                n_rows=n_rows,
558                schema_sample_rows=schema_sample_rows,
559                dtype_coercion=dtype_coercion,
560                use_columns=use_columns,
561                dtypes=dtypes,
562                eager=True,
563                skip_whitespace_tail_rows=skip_whitespace_tail_rows,
564                whitespace_as_null=whitespace_as_null,
565            )
566        else:
567            return ExcelTable(
568                self._reader.load_table(
569                    name=name,
570                    header_row=header_row,
571                    column_names=column_names,
572                    skip_rows=skip_rows,
573                    n_rows=n_rows,
574                    schema_sample_rows=schema_sample_rows,
575                    dtype_coercion=dtype_coercion,
576                    use_columns=use_columns,
577                    dtypes=dtypes,
578                    eager=False,
579                    skip_whitespace_tail_rows=skip_whitespace_tail_rows,
580                    whitespace_as_null=whitespace_as_null,
581                )
582            )
583
584    def load_sheet_eager(
585        self,
586        idx_or_name: int | str,
587        *,
588        header_row: int | None = 0,
589        column_names: list[str] | None = None,
590        skip_rows: int | list[int] | Callable[[int], bool] | None = None,
591        n_rows: int | None = None,
592        schema_sample_rows: int | None = 1_000,
593        dtype_coercion: Literal["coerce", "strict"] = "coerce",
594        use_columns: list[str] | list[int] | str | None = None,
595        dtypes: DType | DTypeMap | None = None,
596    ) -> pa.RecordBatch:
597        """Loads a sheet eagerly by index or name.
598
599        For xlsx files, this will be faster and more memory-efficient, as it will use
600        `worksheet_range_ref` under the hood, which returns borrowed types.
601
602        Refer to `load_sheet` for parameter documentation
603
604        Requires the `pyarrow` extra to be installed.
605        """
606        return self._reader.load_sheet(
607            idx_or_name=idx_or_name,
608            header_row=header_row,
609            column_names=column_names,
610            skip_rows=skip_rows,
611            n_rows=n_rows,
612            schema_sample_rows=schema_sample_rows,
613            dtype_coercion=dtype_coercion,
614            use_columns=use_columns,
615            dtypes=dtypes,
616            eager=True,
617        )
618
619    def load_sheet_by_name(
620        self,
621        name: str,
622        *,
623        header_row: int | None = 0,
624        column_names: list[str] | None = None,
625        skip_rows: int | None = None,
626        n_rows: int | None = None,
627        schema_sample_rows: int | None = 1_000,
628        dtype_coercion: Literal["coerce", "strict"] = "coerce",
629        use_columns: list[str]
630        | list[int]
631        | str
632        | Callable[[ColumnInfoNoDtype], bool]
633        | None = None,
634        dtypes: DType | DTypeMap | None = None,
635    ) -> ExcelSheet:
636        """Loads a sheet by name.
637
638        Refer to `load_sheet` for parameter documentation
639        """
640        return self.load_sheet(
641            name,
642            header_row=header_row,
643            column_names=column_names,
644            skip_rows=skip_rows,
645            n_rows=n_rows,
646            schema_sample_rows=schema_sample_rows,
647            dtype_coercion=dtype_coercion,
648            use_columns=use_columns,
649            dtypes=dtypes,
650        )
651
652    def load_sheet_by_idx(
653        self,
654        idx: int,
655        *,
656        header_row: int | None = 0,
657        column_names: list[str] | None = None,
658        skip_rows: int | None = None,
659        n_rows: int | None = None,
660        schema_sample_rows: int | None = 1_000,
661        dtype_coercion: Literal["coerce", "strict"] = "coerce",
662        use_columns: list[str]
663        | list[int]
664        | str
665        | Callable[[ColumnInfoNoDtype], bool]
666        | None = None,
667        dtypes: DType | DTypeMap | None = None,
668    ) -> ExcelSheet:
669        """Loads a sheet by index.
670
671        Refer to `load_sheet` for parameter documentation
672        """
673        return self.load_sheet(
674            idx,
675            header_row=header_row,
676            column_names=column_names,
677            skip_rows=skip_rows,
678            n_rows=n_rows,
679            schema_sample_rows=schema_sample_rows,
680            dtype_coercion=dtype_coercion,
681            use_columns=use_columns,
682            dtypes=dtypes,
683        )
684
685    def __repr__(self) -> str:
686        return self._reader.__repr__()
687
688
689def read_excel(source: Path | str | bytes) -> ExcelReader:
690    """Opens and loads an excel file.
691
692    :param source: The path to a file or its content as bytes
693    """
694    if isinstance(source, str | Path):
695        source = expanduser(source)
696    return ExcelReader(_read_excel(source))
697
698
699__all__ = (
700    # version
701    "__version__",
702    # main entrypoint
703    "read_excel",
704    # Python types
705    "DType",
706    "DTypeMap",
707    # Excel reader
708    "ExcelReader",
709    # Excel sheet
710    "ExcelSheet",
711    # Excel table
712    "ExcelTable",
713    # Column metadata
714    "DTypeFrom",
715    "ColumnNameFrom",
716    "ColumnInfo",
717    # Defined names
718    "DefinedName",
719    # Parse error information
720    "CellError",
721    "CellErrors",
722    # Exceptions
723    "FastExcelError",
724    "CannotRetrieveCellDataError",
725    "CalamineCellError",
726    "CalamineError",
727    "SheetNotFoundError",
728    "ColumnNotFoundError",
729    "ArrowError",
730    "InvalidParametersError",
731    "UnsupportedColumnTypeCombinationError",
732)
__version__ = '0.18.0'
def read_excel(source: pathlib.Path | str | bytes) -> ExcelReader:
690def read_excel(source: Path | str | bytes) -> ExcelReader:
691    """Opens and loads an excel file.
692
693    :param source: The path to a file or its content as bytes
694    """
695    if isinstance(source, str | Path):
696        source = expanduser(source)
697    return ExcelReader(_read_excel(source))

Opens and loads an excel file.

Parameters
  • source: The path to a file or its content as bytes
DType = typing.Literal['null', 'int', 'float', 'string', 'boolean', 'datetime', 'date', 'duration']
DTypeMap: TypeAlias = 'dict[str | int, DType]'
class ExcelReader:
278class ExcelReader:
279    """A class representing an open Excel file and allowing to read its sheets"""
280
281    def __init__(self, reader: _ExcelReader) -> None:
282        self._reader = reader
283
284    @property
285    def sheet_names(self) -> list[str]:
286        """The list of sheet names"""
287        return self._reader.sheet_names
288
289    @typing.overload
290    def load_sheet(
291        self,
292        idx_or_name: int | str,
293        *,
294        header_row: int | None = 0,
295        column_names: list[str] | None = None,
296        skip_rows: int | list[int] | Callable[[int], bool] | None = None,
297        n_rows: int | None = None,
298        schema_sample_rows: int | None = 1_000,
299        dtype_coercion: Literal["coerce", "strict"] = "coerce",
300        use_columns: list[str]
301        | list[int]
302        | str
303        | Callable[[ColumnInfoNoDtype], bool]
304        | None = None,
305        dtypes: DType | DTypeMap | None = None,
306        eager: Literal[False] = ...,
307        skip_whitespace_tail_rows: bool = False,
308        whitespace_as_null: bool = False,
309    ) -> ExcelSheet: ...
310
311    @typing.overload
312    def load_sheet(
313        self,
314        idx_or_name: int | str,
315        *,
316        header_row: int | None = 0,
317        column_names: list[str] | None = None,
318        skip_rows: int | list[int] | Callable[[int], bool] | None = None,
319        n_rows: int | None = None,
320        schema_sample_rows: int | None = 1_000,
321        dtype_coercion: Literal["coerce", "strict"] = "coerce",
322        use_columns: list[str]
323        | list[int]
324        | str
325        | Callable[[ColumnInfoNoDtype], bool]
326        | None = None,
327        dtypes: DType | DTypeMap | None = None,
328        eager: Literal[True] = ...,
329        skip_whitespace_tail_rows: bool = False,
330        whitespace_as_null: bool = False,
331    ) -> pa.RecordBatch: ...
332
333    def load_sheet(
334        self,
335        idx_or_name: int | str,
336        *,
337        header_row: int | None = 0,
338        column_names: list[str] | None = None,
339        skip_rows: int | list[int] | Callable[[int], bool] | None = None,
340        n_rows: int | None = None,
341        schema_sample_rows: int | None = 1_000,
342        dtype_coercion: Literal["coerce", "strict"] = "coerce",
343        use_columns: list[str]
344        | list[int]
345        | str
346        | Callable[[ColumnInfoNoDtype], bool]
347        | None = None,
348        dtypes: DType | DTypeMap | None = None,
349        eager: bool = False,
350        skip_whitespace_tail_rows: bool = False,
351        whitespace_as_null: bool = False,
352    ) -> ExcelSheet | pa.RecordBatch:
353        """Loads a sheet by index or name.
354
355        :param idx_or_name: The index (starting at 0) or the name of the sheet to load.
356        :param header_row: The index of the row containing the column labels, default index is 0.
357                           If `None`, the sheet does not have any column labels.
358                           Any rows before the `header_row` will be automatically skipped.
359        :param column_names: Overrides headers found in the document.
360                             If `column_names` is used, `header_row` will be ignored.
361        :param n_rows: Specifies how many rows should be loaded.
362                       If `None`, all rows are loaded
363        :param skip_rows: Specifies which rows should be skipped after the `header_row`.
364                          Any rows before the `header_row` are automatically skipped.
365                          It means row indices are relative to data rows, not the sheet!
366                          Can be one of:
367                          - `int`: Skip this many rows after the header row
368                          - `list[int]`: Skip specific row indices (0-based relative to data rows)
369                          - `Callable[[int], bool]`: Function that receives row index (0-based
370                          relative to data rows) and returns True to skip the row
371                          - `None`: If `header_row` is None, skips empty rows at beginning
372        :param schema_sample_rows: Specifies how many rows should be used to determine
373                                   the dtype of a column. Cannot be 0. A specific dtype can be
374                                   enforced for some or all columns through the `dtypes` parameter.
375                                   If `None`, all rows will be used.
376        :param dtype_coercion: Specifies how type coercion should behave. `coerce` (the default)
377                               will try to coerce different dtypes in a column to the same one,
378                               whereas `strict` will raise an error in case a column contains
379                               several dtypes. Note that this only applies to columns whose dtype
380                               is guessed, i.e. not specified via `dtypes`.
381        :param use_columns: Specifies the columns to use. Can either be:
382                            - `None` to select all columns
383                            - A list of strings and ints, the column names and/or indices
384                              (starting at 0)
385                            - A string, a comma separated list of Excel column letters and column
386                              ranges (e.g. `"A:E"` or `"A,C,E:F"`, which would result in
387                              `A,B,C,D,E` and `A,C,E,F`). Also supports open-ended ranges
388                              (e.g. `"B:"` to select all columns from B onwards) and from-beginning
389                              ranges (e.g. `":C"` to select columns from A to C). These can be
390                              combined for "except" patterns (e.g. `":C,E:"` to select everything
391                              except column D)
392                            - A callable, a function that takes a column and returns a boolean
393                              indicating whether the column should be used
394        :param dtypes: An optional dtype (for all columns)
395                       or dict of dtypes with keys as column indices or names.
396        :param eager: Specifies whether the sheet should be loaded eagerly.
397                      `False` (default) will load the sheet lazily using the `PyCapsule` interface,
398                      whereas `True` will load it eagerly via `pyarrow`.
399
400                      Eager loading requires the `pyarrow` extra to be installed.
401        :param skip_whitespace_tail_rows: Skip rows at the end of the sheet
402                                          containing only whitespace and null values.
403        :param whitespace_as_null: Consider cells containing only whitespace as null values.
404        """
405        sheet_or_rb = self._reader.load_sheet(
406            idx_or_name=idx_or_name,
407            header_row=header_row,
408            column_names=column_names,
409            skip_rows=skip_rows,
410            n_rows=n_rows,
411            schema_sample_rows=schema_sample_rows,
412            dtype_coercion=dtype_coercion,
413            use_columns=use_columns,
414            dtypes=dtypes,
415            eager=eager,
416            skip_whitespace_tail_rows=skip_whitespace_tail_rows,
417            whitespace_as_null=whitespace_as_null,
418        )
419        return sheet_or_rb if eager else ExcelSheet(sheet_or_rb)
420
421    def table_names(self, sheet_name: str | None = None) -> list[str]:
422        """The list of table names.
423
424        Will return an empty list if no tables are found.
425
426        :param sheet_name: If given, will limit the list to the given sheet, will be faster
427        too.
428        """
429        return self._reader.table_names(sheet_name)
430
431    def defined_names(self) -> list[DefinedName]:
432        """The list of defined names (named ranges) in the workbook.
433
434        Returns a list of DefinedName objects with 'name' and 'formula' attributes.
435        The formula is a string representation of the range or expression.
436
437        Will return an empty list if no defined names are found.
438        """
439        return self._reader.defined_names()
440
441    @typing.overload
442    def load_table(
443        self,
444        name: str,
445        *,
446        header_row: int | None = None,
447        column_names: list[str] | None = None,
448        skip_rows: int | None = None,
449        n_rows: int | None = None,
450        schema_sample_rows: int | None = 1_000,
451        dtype_coercion: Literal["coerce", "strict"] = "coerce",
452        use_columns: list[str]
453        | list[int]
454        | str
455        | Callable[[ColumnInfoNoDtype], bool]
456        | None = None,
457        dtypes: DType | DTypeMap | None = None,
458        eager: Literal[False] = ...,
459        skip_whitespace_tail_rows: bool = False,
460        whitespace_as_null: bool = False,
461    ) -> ExcelTable: ...
462
463    @typing.overload
464    def load_table(
465        self,
466        name: str,
467        *,
468        header_row: int | None = None,
469        column_names: list[str] | None = None,
470        skip_rows: int | None = None,
471        n_rows: int | None = None,
472        schema_sample_rows: int | None = 1_000,
473        dtype_coercion: Literal["coerce", "strict"] = "coerce",
474        use_columns: list[str]
475        | list[int]
476        | str
477        | Callable[[ColumnInfoNoDtype], bool]
478        | None = None,
479        dtypes: DType | DTypeMap | None = None,
480        eager: Literal[True] = ...,
481        skip_whitespace_tail_rows: bool = False,
482        whitespace_as_null: bool = False,
483    ) -> pa.RecordBatch: ...
484
485    def load_table(
486        self,
487        name: str,
488        *,
489        header_row: int | None = None,
490        column_names: list[str] | None = None,
491        skip_rows: int | None = None,
492        n_rows: int | None = None,
493        schema_sample_rows: int | None = 1_000,
494        dtype_coercion: Literal["coerce", "strict"] = "coerce",
495        use_columns: list[str]
496        | list[int]
497        | str
498        | Callable[[ColumnInfoNoDtype], bool]
499        | None = None,
500        dtypes: DType | DTypeMap | None = None,
501        eager: bool = False,
502        skip_whitespace_tail_rows: bool = False,
503        whitespace_as_null: bool = False,
504    ) -> ExcelTable | pa.RecordBatch:
505        """Loads a table by name.
506
507        :param name: The name of the table to load.
508        :param header_row: The index of the row containing the column labels.
509                           If `None`, the table's column names will be used.
510                           Any rows before the `header_row` will be automatically skipped.
511        :param column_names: Overrides headers found in the document.
512                             If `column_names` is used, `header_row` will be ignored.
513        :param n_rows: Specifies how many rows should be loaded.
514                       If `None`, all rows are loaded
515        :param skip_rows: Specifies how many rows should be skipped after the `header_row`.
516                          Any rows before the `header_row` are automatically skipped.
517                          If `header_row` is `None`, it skips the number of rows from the
518                          start of the sheet.
519        :param schema_sample_rows: Specifies how many rows should be used to determine
520                                   the dtype of a column. Cannot be 0. A specific dtype can be
521                                   enforced for some or all columns through the `dtypes` parameter.
522                                   If `None`, all rows will be used.
523        :param dtype_coercion: Specifies how type coercion should behave. `coerce` (the default)
524                               will try to coerce different dtypes in a column to the same one,
525                               whereas `strict` will raise an error in case a column contains
526                               several dtypes. Note that this only applies to columns whose dtype
527                               is guessed, i.e. not specified via `dtypes`.
528        :param use_columns: Specifies the columns to use. Can either be:
529                            - `None` to select all columns
530                            - A list of strings and ints, the column names and/or indices
531                              (starting at 0)
532                            - A string, a comma separated list of Excel column letters and column
533                              ranges (e.g. `"A:E"` or `"A,C,E:F"`, which would result in
534                              `A,B,C,D,E` and `A,C,E,F`). Also supports open-ended ranges
535                              (e.g. `"B:"` to select all columns from B onwards) and from-beginning
536                              ranges (e.g. `":C"` to select columns from A to C). These can be
537                              combined for "except" patterns (e.g. `":C,E:"` to select everything
538                              except column D)
539                            - A callable, a function that takes a column and returns a boolean
540                              indicating whether the column should be used
541        :param dtypes: An optional dtype (for all columns)
542                       or dict of dtypes with keys as column indices or names.
543        :param eager: Specifies whether the table should be loaded eagerly.
544                      `False` (default) will load the table lazily using the `PyCapsule` interface,
545                      whereas `True` will load it eagerly via `pyarrow`.
546
547                      Eager loading requires the `pyarrow` extra to be installed.
548        :param skip_whitespace_tail_rows: Skip rows at the end of the table
549                                          containing only whitespace and null values.
550        :param whitespace_as_null: Consider cells containing only whitespace as null values.
551        """
552        if eager:
553            return self._reader.load_table(
554                name=name,
555                header_row=header_row,
556                column_names=column_names,
557                skip_rows=skip_rows,
558                n_rows=n_rows,
559                schema_sample_rows=schema_sample_rows,
560                dtype_coercion=dtype_coercion,
561                use_columns=use_columns,
562                dtypes=dtypes,
563                eager=True,
564                skip_whitespace_tail_rows=skip_whitespace_tail_rows,
565                whitespace_as_null=whitespace_as_null,
566            )
567        else:
568            return ExcelTable(
569                self._reader.load_table(
570                    name=name,
571                    header_row=header_row,
572                    column_names=column_names,
573                    skip_rows=skip_rows,
574                    n_rows=n_rows,
575                    schema_sample_rows=schema_sample_rows,
576                    dtype_coercion=dtype_coercion,
577                    use_columns=use_columns,
578                    dtypes=dtypes,
579                    eager=False,
580                    skip_whitespace_tail_rows=skip_whitespace_tail_rows,
581                    whitespace_as_null=whitespace_as_null,
582                )
583            )
584
585    def load_sheet_eager(
586        self,
587        idx_or_name: int | str,
588        *,
589        header_row: int | None = 0,
590        column_names: list[str] | None = None,
591        skip_rows: int | list[int] | Callable[[int], bool] | None = None,
592        n_rows: int | None = None,
593        schema_sample_rows: int | None = 1_000,
594        dtype_coercion: Literal["coerce", "strict"] = "coerce",
595        use_columns: list[str] | list[int] | str | None = None,
596        dtypes: DType | DTypeMap | None = None,
597    ) -> pa.RecordBatch:
598        """Loads a sheet eagerly by index or name.
599
600        For xlsx files, this will be faster and more memory-efficient, as it will use
601        `worksheet_range_ref` under the hood, which returns borrowed types.
602
603        Refer to `load_sheet` for parameter documentation
604
605        Requires the `pyarrow` extra to be installed.
606        """
607        return self._reader.load_sheet(
608            idx_or_name=idx_or_name,
609            header_row=header_row,
610            column_names=column_names,
611            skip_rows=skip_rows,
612            n_rows=n_rows,
613            schema_sample_rows=schema_sample_rows,
614            dtype_coercion=dtype_coercion,
615            use_columns=use_columns,
616            dtypes=dtypes,
617            eager=True,
618        )
619
620    def load_sheet_by_name(
621        self,
622        name: str,
623        *,
624        header_row: int | None = 0,
625        column_names: list[str] | None = None,
626        skip_rows: int | None = None,
627        n_rows: int | None = None,
628        schema_sample_rows: int | None = 1_000,
629        dtype_coercion: Literal["coerce", "strict"] = "coerce",
630        use_columns: list[str]
631        | list[int]
632        | str
633        | Callable[[ColumnInfoNoDtype], bool]
634        | None = None,
635        dtypes: DType | DTypeMap | None = None,
636    ) -> ExcelSheet:
637        """Loads a sheet by name.
638
639        Refer to `load_sheet` for parameter documentation
640        """
641        return self.load_sheet(
642            name,
643            header_row=header_row,
644            column_names=column_names,
645            skip_rows=skip_rows,
646            n_rows=n_rows,
647            schema_sample_rows=schema_sample_rows,
648            dtype_coercion=dtype_coercion,
649            use_columns=use_columns,
650            dtypes=dtypes,
651        )
652
653    def load_sheet_by_idx(
654        self,
655        idx: int,
656        *,
657        header_row: int | None = 0,
658        column_names: list[str] | None = None,
659        skip_rows: int | None = None,
660        n_rows: int | None = None,
661        schema_sample_rows: int | None = 1_000,
662        dtype_coercion: Literal["coerce", "strict"] = "coerce",
663        use_columns: list[str]
664        | list[int]
665        | str
666        | Callable[[ColumnInfoNoDtype], bool]
667        | None = None,
668        dtypes: DType | DTypeMap | None = None,
669    ) -> ExcelSheet:
670        """Loads a sheet by index.
671
672        Refer to `load_sheet` for parameter documentation
673        """
674        return self.load_sheet(
675            idx,
676            header_row=header_row,
677            column_names=column_names,
678            skip_rows=skip_rows,
679            n_rows=n_rows,
680            schema_sample_rows=schema_sample_rows,
681            dtype_coercion=dtype_coercion,
682            use_columns=use_columns,
683            dtypes=dtypes,
684        )
685
686    def __repr__(self) -> str:
687        return self._reader.__repr__()

A class representing an open Excel file and allowing to read its sheets

ExcelReader(reader: _ExcelReader)
281    def __init__(self, reader: _ExcelReader) -> None:
282        self._reader = reader
sheet_names: list[str]
284    @property
285    def sheet_names(self) -> list[str]:
286        """The list of sheet names"""
287        return self._reader.sheet_names

The list of sheet names

def load_sheet( self, idx_or_name: int | str, *, header_row: int | None = 0, column_names: list[str] | None = None, skip_rows: int | list[int] | Callable[[int], bool] | None = None, n_rows: int | None = None, schema_sample_rows: int | None = 1000, dtype_coercion: Literal['coerce', 'strict'] = 'coerce', use_columns: list[str] | list[int] | str | Callable[[ColumnInfoNoDtype], bool] | None = None, dtypes: Union[Literal['null', 'int', 'float', 'string', 'boolean', 'datetime', 'date', 'duration'], dict[str | int, Literal['null', 'int', 'float', 'string', 'boolean', 'datetime', 'date', 'duration']], NoneType] = None, eager: bool = False, skip_whitespace_tail_rows: bool = False, whitespace_as_null: bool = False) -> ExcelSheet | pyarrow.lib.RecordBatch:
333    def load_sheet(
334        self,
335        idx_or_name: int | str,
336        *,
337        header_row: int | None = 0,
338        column_names: list[str] | None = None,
339        skip_rows: int | list[int] | Callable[[int], bool] | None = None,
340        n_rows: int | None = None,
341        schema_sample_rows: int | None = 1_000,
342        dtype_coercion: Literal["coerce", "strict"] = "coerce",
343        use_columns: list[str]
344        | list[int]
345        | str
346        | Callable[[ColumnInfoNoDtype], bool]
347        | None = None,
348        dtypes: DType | DTypeMap | None = None,
349        eager: bool = False,
350        skip_whitespace_tail_rows: bool = False,
351        whitespace_as_null: bool = False,
352    ) -> ExcelSheet | pa.RecordBatch:
353        """Loads a sheet by index or name.
354
355        :param idx_or_name: The index (starting at 0) or the name of the sheet to load.
356        :param header_row: The index of the row containing the column labels, default index is 0.
357                           If `None`, the sheet does not have any column labels.
358                           Any rows before the `header_row` will be automatically skipped.
359        :param column_names: Overrides headers found in the document.
360                             If `column_names` is used, `header_row` will be ignored.
361        :param n_rows: Specifies how many rows should be loaded.
362                       If `None`, all rows are loaded
363        :param skip_rows: Specifies which rows should be skipped after the `header_row`.
364                          Any rows before the `header_row` are automatically skipped.
365                          It means row indices are relative to data rows, not the sheet!
366                          Can be one of:
367                          - `int`: Skip this many rows after the header row
368                          - `list[int]`: Skip specific row indices (0-based relative to data rows)
369                          - `Callable[[int], bool]`: Function that receives row index (0-based
370                          relative to data rows) and returns True to skip the row
371                          - `None`: If `header_row` is None, skips empty rows at beginning
372        :param schema_sample_rows: Specifies how many rows should be used to determine
373                                   the dtype of a column. Cannot be 0. A specific dtype can be
374                                   enforced for some or all columns through the `dtypes` parameter.
375                                   If `None`, all rows will be used.
376        :param dtype_coercion: Specifies how type coercion should behave. `coerce` (the default)
377                               will try to coerce different dtypes in a column to the same one,
378                               whereas `strict` will raise an error in case a column contains
379                               several dtypes. Note that this only applies to columns whose dtype
380                               is guessed, i.e. not specified via `dtypes`.
381        :param use_columns: Specifies the columns to use. Can either be:
382                            - `None` to select all columns
383                            - A list of strings and ints, the column names and/or indices
384                              (starting at 0)
385                            - A string, a comma separated list of Excel column letters and column
386                              ranges (e.g. `"A:E"` or `"A,C,E:F"`, which would result in
387                              `A,B,C,D,E` and `A,C,E,F`). Also supports open-ended ranges
388                              (e.g. `"B:"` to select all columns from B onwards) and from-beginning
389                              ranges (e.g. `":C"` to select columns from A to C). These can be
390                              combined for "except" patterns (e.g. `":C,E:"` to select everything
391                              except column D)
392                            - A callable, a function that takes a column and returns a boolean
393                              indicating whether the column should be used
394        :param dtypes: An optional dtype (for all columns)
395                       or dict of dtypes with keys as column indices or names.
396        :param eager: Specifies whether the sheet should be loaded eagerly.
397                      `False` (default) will load the sheet lazily using the `PyCapsule` interface,
398                      whereas `True` will load it eagerly via `pyarrow`.
399
400                      Eager loading requires the `pyarrow` extra to be installed.
401        :param skip_whitespace_tail_rows: Skip rows at the end of the sheet
402                                          containing only whitespace and null values.
403        :param whitespace_as_null: Consider cells containing only whitespace as null values.
404        """
405        sheet_or_rb = self._reader.load_sheet(
406            idx_or_name=idx_or_name,
407            header_row=header_row,
408            column_names=column_names,
409            skip_rows=skip_rows,
410            n_rows=n_rows,
411            schema_sample_rows=schema_sample_rows,
412            dtype_coercion=dtype_coercion,
413            use_columns=use_columns,
414            dtypes=dtypes,
415            eager=eager,
416            skip_whitespace_tail_rows=skip_whitespace_tail_rows,
417            whitespace_as_null=whitespace_as_null,
418        )
419        return sheet_or_rb if eager else ExcelSheet(sheet_or_rb)

Loads a sheet by index or name.

Parameters
  • idx_or_name: The index (starting at 0) or the name of the sheet to load.
  • header_row: The index of the row containing the column labels, default index is 0. If None, the sheet does not have any column labels. Any rows before the header_row will be automatically skipped.
  • column_names: Overrides headers found in the document. If column_names is used, header_row will be ignored.
  • n_rows: Specifies how many rows should be loaded. If None, all rows are loaded
  • skip_rows: Specifies which rows should be skipped after the header_row. Any rows before the header_row are automatically skipped. It means row indices are relative to data rows, not the sheet! Can be one of:
    • int: Skip this many rows after the header row
    • list[int]: Skip specific row indices (0-based relative to data rows)
    • Callable[[int], bool]: Function that receives row index (0-based relative to data rows) and returns True to skip the row
    • None: If header_row is None, skips empty rows at beginning
  • schema_sample_rows: Specifies how many rows should be used to determine the dtype of a column. Cannot be 0. A specific dtype can be enforced for some or all columns through the dtypes parameter. If None, all rows will be used.
  • dtype_coercion: Specifies how type coercion should behave. coerce (the default) will try to coerce different dtypes in a column to the same one, whereas strict will raise an error in case a column contains several dtypes. Note that this only applies to columns whose dtype is guessed, i.e. not specified via dtypes.
  • use_columns: Specifies the columns to use. Can either be:
    • None to select all columns
    • A list of strings and ints, the column names and/or indices (starting at 0)
    • A string, a comma separated list of Excel column letters and column ranges (e.g. "A:E" or "A,C,E:F", which would result in A,B,C,D,E and A,C,E,F). Also supports open-ended ranges (e.g. "B:" to select all columns from B onwards) and from-beginning ranges (e.g. ":C" to select columns from A to C). These can be combined for "except" patterns (e.g. ":C,E:" to select everything except column D)
    • A callable, a function that takes a column and returns a boolean indicating whether the column should be used
  • dtypes: An optional dtype (for all columns) or dict of dtypes with keys as column indices or names.
  • eager: Specifies whether the sheet should be loaded eagerly. False (default) will load the sheet lazily using the PyCapsule interface, whereas True will load it eagerly via pyarrow.

          Eager loading requires the `pyarrow` extra to be installed.
    
  • skip_whitespace_tail_rows: Skip rows at the end of the sheet containing only whitespace and null values.
  • whitespace_as_null: Consider cells containing only whitespace as null values.
def table_names(self, sheet_name: str | None = None) -> list[str]:
421    def table_names(self, sheet_name: str | None = None) -> list[str]:
422        """The list of table names.
423
424        Will return an empty list if no tables are found.
425
426        :param sheet_name: If given, will limit the list to the given sheet, will be faster
427        too.
428        """
429        return self._reader.table_names(sheet_name)

The list of table names.

Will return an empty list if no tables are found.

Parameters
  • sheet_name: If given, will limit the list to the given sheet, will be faster too.
def defined_names(self) -> list[DefinedName]:
431    def defined_names(self) -> list[DefinedName]:
432        """The list of defined names (named ranges) in the workbook.
433
434        Returns a list of DefinedName objects with 'name' and 'formula' attributes.
435        The formula is a string representation of the range or expression.
436
437        Will return an empty list if no defined names are found.
438        """
439        return self._reader.defined_names()

The list of defined names (named ranges) in the workbook.

Returns a list of DefinedName objects with 'name' and 'formula' attributes. The formula is a string representation of the range or expression.

Will return an empty list if no defined names are found.

def load_table( self, name: str, *, header_row: int | None = None, column_names: list[str] | None = None, skip_rows: int | None = None, n_rows: int | None = None, schema_sample_rows: int | None = 1000, dtype_coercion: Literal['coerce', 'strict'] = 'coerce', use_columns: list[str] | list[int] | str | Callable[[ColumnInfoNoDtype], bool] | None = None, dtypes: Union[Literal['null', 'int', 'float', 'string', 'boolean', 'datetime', 'date', 'duration'], dict[str | int, Literal['null', 'int', 'float', 'string', 'boolean', 'datetime', 'date', 'duration']], NoneType] = None, eager: bool = False, skip_whitespace_tail_rows: bool = False, whitespace_as_null: bool = False) -> ExcelTable | pyarrow.lib.RecordBatch:
485    def load_table(
486        self,
487        name: str,
488        *,
489        header_row: int | None = None,
490        column_names: list[str] | None = None,
491        skip_rows: int | None = None,
492        n_rows: int | None = None,
493        schema_sample_rows: int | None = 1_000,
494        dtype_coercion: Literal["coerce", "strict"] = "coerce",
495        use_columns: list[str]
496        | list[int]
497        | str
498        | Callable[[ColumnInfoNoDtype], bool]
499        | None = None,
500        dtypes: DType | DTypeMap | None = None,
501        eager: bool = False,
502        skip_whitespace_tail_rows: bool = False,
503        whitespace_as_null: bool = False,
504    ) -> ExcelTable | pa.RecordBatch:
505        """Loads a table by name.
506
507        :param name: The name of the table to load.
508        :param header_row: The index of the row containing the column labels.
509                           If `None`, the table's column names will be used.
510                           Any rows before the `header_row` will be automatically skipped.
511        :param column_names: Overrides headers found in the document.
512                             If `column_names` is used, `header_row` will be ignored.
513        :param n_rows: Specifies how many rows should be loaded.
514                       If `None`, all rows are loaded
515        :param skip_rows: Specifies how many rows should be skipped after the `header_row`.
516                          Any rows before the `header_row` are automatically skipped.
517                          If `header_row` is `None`, it skips the number of rows from the
518                          start of the sheet.
519        :param schema_sample_rows: Specifies how many rows should be used to determine
520                                   the dtype of a column. Cannot be 0. A specific dtype can be
521                                   enforced for some or all columns through the `dtypes` parameter.
522                                   If `None`, all rows will be used.
523        :param dtype_coercion: Specifies how type coercion should behave. `coerce` (the default)
524                               will try to coerce different dtypes in a column to the same one,
525                               whereas `strict` will raise an error in case a column contains
526                               several dtypes. Note that this only applies to columns whose dtype
527                               is guessed, i.e. not specified via `dtypes`.
528        :param use_columns: Specifies the columns to use. Can either be:
529                            - `None` to select all columns
530                            - A list of strings and ints, the column names and/or indices
531                              (starting at 0)
532                            - A string, a comma separated list of Excel column letters and column
533                              ranges (e.g. `"A:E"` or `"A,C,E:F"`, which would result in
534                              `A,B,C,D,E` and `A,C,E,F`). Also supports open-ended ranges
535                              (e.g. `"B:"` to select all columns from B onwards) and from-beginning
536                              ranges (e.g. `":C"` to select columns from A to C). These can be
537                              combined for "except" patterns (e.g. `":C,E:"` to select everything
538                              except column D)
539                            - A callable, a function that takes a column and returns a boolean
540                              indicating whether the column should be used
541        :param dtypes: An optional dtype (for all columns)
542                       or dict of dtypes with keys as column indices or names.
543        :param eager: Specifies whether the table should be loaded eagerly.
544                      `False` (default) will load the table lazily using the `PyCapsule` interface,
545                      whereas `True` will load it eagerly via `pyarrow`.
546
547                      Eager loading requires the `pyarrow` extra to be installed.
548        :param skip_whitespace_tail_rows: Skip rows at the end of the table
549                                          containing only whitespace and null values.
550        :param whitespace_as_null: Consider cells containing only whitespace as null values.
551        """
552        if eager:
553            return self._reader.load_table(
554                name=name,
555                header_row=header_row,
556                column_names=column_names,
557                skip_rows=skip_rows,
558                n_rows=n_rows,
559                schema_sample_rows=schema_sample_rows,
560                dtype_coercion=dtype_coercion,
561                use_columns=use_columns,
562                dtypes=dtypes,
563                eager=True,
564                skip_whitespace_tail_rows=skip_whitespace_tail_rows,
565                whitespace_as_null=whitespace_as_null,
566            )
567        else:
568            return ExcelTable(
569                self._reader.load_table(
570                    name=name,
571                    header_row=header_row,
572                    column_names=column_names,
573                    skip_rows=skip_rows,
574                    n_rows=n_rows,
575                    schema_sample_rows=schema_sample_rows,
576                    dtype_coercion=dtype_coercion,
577                    use_columns=use_columns,
578                    dtypes=dtypes,
579                    eager=False,
580                    skip_whitespace_tail_rows=skip_whitespace_tail_rows,
581                    whitespace_as_null=whitespace_as_null,
582                )
583            )

Loads a table by name.

Parameters
  • name: The name of the table to load.
  • header_row: The index of the row containing the column labels. If None, the table's column names will be used. Any rows before the header_row will be automatically skipped.
  • column_names: Overrides headers found in the document. If column_names is used, header_row will be ignored.
  • n_rows: Specifies how many rows should be loaded. If None, all rows are loaded
  • skip_rows: Specifies how many rows should be skipped after the header_row. Any rows before the header_row are automatically skipped. If header_row is None, it skips the number of rows from the start of the sheet.
  • schema_sample_rows: Specifies how many rows should be used to determine the dtype of a column. Cannot be 0. A specific dtype can be enforced for some or all columns through the dtypes parameter. If None, all rows will be used.
  • dtype_coercion: Specifies how type coercion should behave. coerce (the default) will try to coerce different dtypes in a column to the same one, whereas strict will raise an error in case a column contains several dtypes. Note that this only applies to columns whose dtype is guessed, i.e. not specified via dtypes.
  • use_columns: Specifies the columns to use. Can either be:
    • None to select all columns
    • A list of strings and ints, the column names and/or indices (starting at 0)
    • A string, a comma separated list of Excel column letters and column ranges (e.g. "A:E" or "A,C,E:F", which would result in A,B,C,D,E and A,C,E,F). Also supports open-ended ranges (e.g. "B:" to select all columns from B onwards) and from-beginning ranges (e.g. ":C" to select columns from A to C). These can be combined for "except" patterns (e.g. ":C,E:" to select everything except column D)
    • A callable, a function that takes a column and returns a boolean indicating whether the column should be used
  • dtypes: An optional dtype (for all columns) or dict of dtypes with keys as column indices or names.
  • eager: Specifies whether the table should be loaded eagerly. False (default) will load the table lazily using the PyCapsule interface, whereas True will load it eagerly via pyarrow.

          Eager loading requires the `pyarrow` extra to be installed.
    
  • skip_whitespace_tail_rows: Skip rows at the end of the table containing only whitespace and null values.
  • whitespace_as_null: Consider cells containing only whitespace as null values.
def load_sheet_eager( self, idx_or_name: int | str, *, header_row: int | None = 0, column_names: list[str] | None = None, skip_rows: int | list[int] | Callable[[int], bool] | None = None, n_rows: int | None = None, schema_sample_rows: int | None = 1000, dtype_coercion: Literal['coerce', 'strict'] = 'coerce', use_columns: list[str] | list[int] | str | None = None, dtypes: Union[Literal['null', 'int', 'float', 'string', 'boolean', 'datetime', 'date', 'duration'], dict[str | int, Literal['null', 'int', 'float', 'string', 'boolean', 'datetime', 'date', 'duration']], NoneType] = None) -> pyarrow.lib.RecordBatch:
585    def load_sheet_eager(
586        self,
587        idx_or_name: int | str,
588        *,
589        header_row: int | None = 0,
590        column_names: list[str] | None = None,
591        skip_rows: int | list[int] | Callable[[int], bool] | None = None,
592        n_rows: int | None = None,
593        schema_sample_rows: int | None = 1_000,
594        dtype_coercion: Literal["coerce", "strict"] = "coerce",
595        use_columns: list[str] | list[int] | str | None = None,
596        dtypes: DType | DTypeMap | None = None,
597    ) -> pa.RecordBatch:
598        """Loads a sheet eagerly by index or name.
599
600        For xlsx files, this will be faster and more memory-efficient, as it will use
601        `worksheet_range_ref` under the hood, which returns borrowed types.
602
603        Refer to `load_sheet` for parameter documentation
604
605        Requires the `pyarrow` extra to be installed.
606        """
607        return self._reader.load_sheet(
608            idx_or_name=idx_or_name,
609            header_row=header_row,
610            column_names=column_names,
611            skip_rows=skip_rows,
612            n_rows=n_rows,
613            schema_sample_rows=schema_sample_rows,
614            dtype_coercion=dtype_coercion,
615            use_columns=use_columns,
616            dtypes=dtypes,
617            eager=True,
618        )

Loads a sheet eagerly by index or name.

For xlsx files, this will be faster and more memory-efficient, as it will use worksheet_range_ref under the hood, which returns borrowed types.

Refer to load_sheet for parameter documentation

Requires the pyarrow extra to be installed.

def load_sheet_by_name( self, name: str, *, header_row: int | None = 0, column_names: list[str] | None = None, skip_rows: int | None = None, n_rows: int | None = None, schema_sample_rows: int | None = 1000, dtype_coercion: Literal['coerce', 'strict'] = 'coerce', use_columns: list[str] | list[int] | str | Callable[[ColumnInfoNoDtype], bool] | None = None, dtypes: Union[Literal['null', 'int', 'float', 'string', 'boolean', 'datetime', 'date', 'duration'], dict[str | int, Literal['null', 'int', 'float', 'string', 'boolean', 'datetime', 'date', 'duration']], NoneType] = None) -> ExcelSheet:
620    def load_sheet_by_name(
621        self,
622        name: str,
623        *,
624        header_row: int | None = 0,
625        column_names: list[str] | None = None,
626        skip_rows: int | None = None,
627        n_rows: int | None = None,
628        schema_sample_rows: int | None = 1_000,
629        dtype_coercion: Literal["coerce", "strict"] = "coerce",
630        use_columns: list[str]
631        | list[int]
632        | str
633        | Callable[[ColumnInfoNoDtype], bool]
634        | None = None,
635        dtypes: DType | DTypeMap | None = None,
636    ) -> ExcelSheet:
637        """Loads a sheet by name.
638
639        Refer to `load_sheet` for parameter documentation
640        """
641        return self.load_sheet(
642            name,
643            header_row=header_row,
644            column_names=column_names,
645            skip_rows=skip_rows,
646            n_rows=n_rows,
647            schema_sample_rows=schema_sample_rows,
648            dtype_coercion=dtype_coercion,
649            use_columns=use_columns,
650            dtypes=dtypes,
651        )

Loads a sheet by name.

Refer to load_sheet for parameter documentation

def load_sheet_by_idx( self, idx: int, *, header_row: int | None = 0, column_names: list[str] | None = None, skip_rows: int | None = None, n_rows: int | None = None, schema_sample_rows: int | None = 1000, dtype_coercion: Literal['coerce', 'strict'] = 'coerce', use_columns: list[str] | list[int] | str | Callable[[ColumnInfoNoDtype], bool] | None = None, dtypes: Union[Literal['null', 'int', 'float', 'string', 'boolean', 'datetime', 'date', 'duration'], dict[str | int, Literal['null', 'int', 'float', 'string', 'boolean', 'datetime', 'date', 'duration']], NoneType] = None) -> ExcelSheet:
653    def load_sheet_by_idx(
654        self,
655        idx: int,
656        *,
657        header_row: int | None = 0,
658        column_names: list[str] | None = None,
659        skip_rows: int | None = None,
660        n_rows: int | None = None,
661        schema_sample_rows: int | None = 1_000,
662        dtype_coercion: Literal["coerce", "strict"] = "coerce",
663        use_columns: list[str]
664        | list[int]
665        | str
666        | Callable[[ColumnInfoNoDtype], bool]
667        | None = None,
668        dtypes: DType | DTypeMap | None = None,
669    ) -> ExcelSheet:
670        """Loads a sheet by index.
671
672        Refer to `load_sheet` for parameter documentation
673        """
674        return self.load_sheet(
675            idx,
676            header_row=header_row,
677            column_names=column_names,
678            skip_rows=skip_rows,
679            n_rows=n_rows,
680            schema_sample_rows=schema_sample_rows,
681            dtype_coercion=dtype_coercion,
682            use_columns=use_columns,
683            dtypes=dtypes,
684        )

Loads a sheet by index.

Refer to load_sheet for parameter documentation

class ExcelSheet:
 55class ExcelSheet:
 56    """A class representing a single sheet in an Excel File"""
 57
 58    def __init__(self, sheet: _ExcelSheet) -> None:
 59        self._sheet = sheet
 60
 61    @property
 62    def name(self) -> str:
 63        """The name of the sheet"""
 64        return self._sheet.name
 65
 66    @property
 67    def width(self) -> int:
 68        """The sheet's width"""
 69        return self._sheet.width
 70
 71    @property
 72    def height(self) -> int:
 73        """The sheet's height, with `skip_rows` and `nrows` applied"""
 74        return self._sheet.height
 75
 76    @property
 77    def total_height(self) -> int:
 78        """The sheet's total height"""
 79        return self._sheet.total_height
 80
 81    @property
 82    def selected_columns(self) -> list[ColumnInfo]:
 83        """The sheet's selected columns"""
 84        return self._sheet.selected_columns
 85
 86    def available_columns(self) -> list[ColumnInfo]:
 87        """The columns available for the given sheet"""
 88        return self._sheet.available_columns()
 89
 90    @property
 91    def specified_dtypes(self) -> DTypeMap | None:
 92        """The dtypes specified for the sheet"""
 93        return self._sheet.specified_dtypes
 94
 95    @property
 96    def visible(self) -> SheetVisible:
 97        """The visibility of the sheet"""
 98        return self._sheet.visible
 99
100    def to_arrow(self) -> pa.RecordBatch:
101        """Converts the sheet to a pyarrow `RecordBatch`
102
103        Requires the `pyarrow` extra to be installed.
104        """
105        if not _PYARROW_AVAILABLE:
106            raise ImportError(
107                "pyarrow is required for to_arrow(). Install with: pip install 'fastexcel[pyarrow]'"
108            )
109        return self._sheet.to_arrow()
110
111    def to_arrow_with_errors(self) -> tuple[pa.RecordBatch, CellErrors | None]:
112        """Converts the sheet to a pyarrow `RecordBatch` with error information.
113
114        Stores the positions of any values that cannot be parsed as the specified type and were
115        therefore converted to None.
116
117        Requires the `pyarrow` extra to be installed.
118        """
119        if not _PYARROW_AVAILABLE:
120            raise ImportError(
121                "pyarrow is required for to_arrow_with_errors(). Install with: pip install 'fastexcel[pyarrow]'"  # noqa: E501
122            )
123        rb, cell_errors = self._sheet.to_arrow_with_errors()
124        if not cell_errors.errors:
125            return (rb, None)
126        return (rb, cell_errors)
127
128    def to_pandas(self) -> pd.DataFrame:
129        """Converts the sheet to a Pandas `DataFrame`.
130
131        Requires the `pandas` extra to be installed.
132        """
133        # Note: pandas PyCapsule interface requires __dataframe__ or __arrow_c_stream__
134        # which we don't implement. Using pyarrow conversion for now.
135        # (see https://pandas.pydata.org/docs/reference/api/pandas.api.interchange.from_dataframe.html)
136        return self.to_arrow().to_pandas()
137
138    def to_polars(self) -> pl.DataFrame:
139        """Converts the sheet to a Polars `DataFrame`.
140
141        Uses the Arrow PyCapsule Interface for zero-copy data exchange.
142        Requires the `polars` extra to be installed.
143        """
144        import polars as pl
145
146        return pl.DataFrame(self)
147
148    def __arrow_c_schema__(self) -> object:
149        """Export the schema as an `ArrowSchema` `PyCapsule`.
150
151        https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowschema-export
152
153        The Arrow PyCapsule Interface enables zero-copy data exchange with
154        Arrow-compatible libraries without requiring PyArrow as a dependency.
155        """
156        return self._sheet.__arrow_c_schema__()
157
158    def __arrow_c_array__(self, requested_schema: object | None = None) -> tuple[object, object]:
159        """Export the schema and data as a pair of `ArrowSchema` and `ArrowArray` `PyCapsules`.
160
161        The optional `requested_schema` parameter allows for potential schema conversion.
162
163        https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowarray-export
164
165        The Arrow PyCapsule Interface enables zero-copy data exchange with
166        Arrow-compatible libraries without requiring PyArrow as a dependency.
167        """
168        return self._sheet.__arrow_c_array__(requested_schema)
169
170    def __repr__(self) -> str:
171        return self._sheet.__repr__()

A class representing a single sheet in an Excel File

ExcelSheet(sheet: _ExcelSheet)
58    def __init__(self, sheet: _ExcelSheet) -> None:
59        self._sheet = sheet
name: str
61    @property
62    def name(self) -> str:
63        """The name of the sheet"""
64        return self._sheet.name

The name of the sheet

width: int
66    @property
67    def width(self) -> int:
68        """The sheet's width"""
69        return self._sheet.width

The sheet's width

height: int
71    @property
72    def height(self) -> int:
73        """The sheet's height, with `skip_rows` and `nrows` applied"""
74        return self._sheet.height

The sheet's height, with skip_rows and nrows applied

total_height: int
76    @property
77    def total_height(self) -> int:
78        """The sheet's total height"""
79        return self._sheet.total_height

The sheet's total height

selected_columns: list[ColumnInfo]
81    @property
82    def selected_columns(self) -> list[ColumnInfo]:
83        """The sheet's selected columns"""
84        return self._sheet.selected_columns

The sheet's selected columns

def available_columns(self) -> list[ColumnInfo]:
86    def available_columns(self) -> list[ColumnInfo]:
87        """The columns available for the given sheet"""
88        return self._sheet.available_columns()

The columns available for the given sheet

specified_dtypes: 'DTypeMap | None'
90    @property
91    def specified_dtypes(self) -> DTypeMap | None:
92        """The dtypes specified for the sheet"""
93        return self._sheet.specified_dtypes

The dtypes specified for the sheet

visible: Literal['visible', 'hidden', 'veryhidden']
95    @property
96    def visible(self) -> SheetVisible:
97        """The visibility of the sheet"""
98        return self._sheet.visible

The visibility of the sheet

def to_arrow(self) -> pyarrow.lib.RecordBatch:
100    def to_arrow(self) -> pa.RecordBatch:
101        """Converts the sheet to a pyarrow `RecordBatch`
102
103        Requires the `pyarrow` extra to be installed.
104        """
105        if not _PYARROW_AVAILABLE:
106            raise ImportError(
107                "pyarrow is required for to_arrow(). Install with: pip install 'fastexcel[pyarrow]'"
108            )
109        return self._sheet.to_arrow()

Converts the sheet to a pyarrow RecordBatch

Requires the pyarrow extra to be installed.

def to_arrow_with_errors(self) -> tuple[pyarrow.lib.RecordBatch, CellErrors | None]:
111    def to_arrow_with_errors(self) -> tuple[pa.RecordBatch, CellErrors | None]:
112        """Converts the sheet to a pyarrow `RecordBatch` with error information.
113
114        Stores the positions of any values that cannot be parsed as the specified type and were
115        therefore converted to None.
116
117        Requires the `pyarrow` extra to be installed.
118        """
119        if not _PYARROW_AVAILABLE:
120            raise ImportError(
121                "pyarrow is required for to_arrow_with_errors(). Install with: pip install 'fastexcel[pyarrow]'"  # noqa: E501
122            )
123        rb, cell_errors = self._sheet.to_arrow_with_errors()
124        if not cell_errors.errors:
125            return (rb, None)
126        return (rb, cell_errors)

Converts the sheet to a pyarrow RecordBatch with error information.

Stores the positions of any values that cannot be parsed as the specified type and were therefore converted to None.

Requires the pyarrow extra to be installed.

def to_pandas(self) -> pandas.core.frame.DataFrame:
128    def to_pandas(self) -> pd.DataFrame:
129        """Converts the sheet to a Pandas `DataFrame`.
130
131        Requires the `pandas` extra to be installed.
132        """
133        # Note: pandas PyCapsule interface requires __dataframe__ or __arrow_c_stream__
134        # which we don't implement. Using pyarrow conversion for now.
135        # (see https://pandas.pydata.org/docs/reference/api/pandas.api.interchange.from_dataframe.html)
136        return self.to_arrow().to_pandas()

Converts the sheet to a Pandas DataFrame.

Requires the pandas extra to be installed.

def to_polars(self) -> polars.dataframe.frame.DataFrame:
138    def to_polars(self) -> pl.DataFrame:
139        """Converts the sheet to a Polars `DataFrame`.
140
141        Uses the Arrow PyCapsule Interface for zero-copy data exchange.
142        Requires the `polars` extra to be installed.
143        """
144        import polars as pl
145
146        return pl.DataFrame(self)

Converts the sheet to a Polars DataFrame.

Uses the Arrow PyCapsule Interface for zero-copy data exchange. Requires the polars extra to be installed.

class ExcelTable:
174class ExcelTable:
175    """A class representing a single table in an Excel file"""
176
177    def __init__(self, table: _ExcelTable) -> None:
178        self._table = table
179
180    @property
181    def name(self) -> str:
182        """The name of the table"""
183        return self._table.name
184
185    @property
186    def sheet_name(self) -> str:
187        """The name of the sheet this table belongs to"""
188        return self._table.sheet_name
189
190    @property
191    def width(self) -> int:
192        """The table's width"""
193        return self._table.width
194
195    @property
196    def height(self) -> int:
197        """The table's height"""
198        return self._table.height
199
200    @property
201    def total_height(self) -> int:
202        """The table's total height"""
203        return self._table.total_height
204
205    @property
206    def offset(self) -> int:
207        """The table's offset before data starts"""
208        return self._table.offset
209
210    @property
211    def selected_columns(self) -> list[ColumnInfo]:
212        """The table's selected columns"""
213        return self._table.selected_columns
214
215    def available_columns(self) -> list[ColumnInfo]:
216        """The columns available for the given table"""
217        return self._table.available_columns()
218
219    @property
220    def specified_dtypes(self) -> DTypeMap | None:
221        """The dtypes specified for the table"""
222        return self._table.specified_dtypes
223
224    def to_arrow(self) -> pa.RecordBatch:
225        """Converts the table to a pyarrow `RecordBatch`
226
227        Requires the `pyarrow` extra to be installed.
228        """
229        if not _PYARROW_AVAILABLE:
230            raise ImportError(
231                "pyarrow is required for to_arrow(). Install with: pip install 'fastexcel[pyarrow]'"
232            )
233        return self._table.to_arrow()
234
235    def to_pandas(self) -> pd.DataFrame:
236        """Converts the table to a Pandas `DataFrame`.
237
238        Requires the `pandas` extra to be installed.
239        """
240        # Note: pandas PyCapsule interface requires __dataframe__ or __arrow_c_stream__
241        # which we don't implement. Using pyarrow conversion for now.
242        # (see https://pandas.pydata.org/docs/reference/api/pandas.api.interchange.from_dataframe.html)
243        return self.to_arrow().to_pandas()
244
245    def to_polars(self) -> pl.DataFrame:
246        """Converts the table to a Polars `DataFrame`.
247
248        Uses the Arrow PyCapsule Interface for zero-copy data exchange.
249        Requires the `polars` extra to be installed.
250        """
251        import polars as pl
252
253        return pl.DataFrame(self)
254
255    def __arrow_c_schema__(self) -> object:
256        """Export the schema as an `ArrowSchema` `PyCapsule`.
257
258        https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowschema-export
259
260        The Arrow PyCapsule Interface enables zero-copy data exchange with
261        Arrow-compatible libraries without requiring PyArrow as a dependency.
262        """
263        return self._table.__arrow_c_schema__()
264
265    def __arrow_c_array__(self, requested_schema: object | None = None) -> tuple[object, object]:
266        """Export the schema and data as a pair of `ArrowSchema` and `ArrowArray` `PyCapsules`.
267
268        The optional `requested_schema` parameter allows for potential schema conversion.
269
270        https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowarray-export
271
272        The Arrow PyCapsule Interface enables zero-copy data exchange with
273        Arrow-compatible libraries without requiring PyArrow as a dependency.
274        """
275        return self._table.__arrow_c_array__(requested_schema)

A class representing a single table in an Excel file

ExcelTable(table: _ExcelTable)
177    def __init__(self, table: _ExcelTable) -> None:
178        self._table = table
name: str
180    @property
181    def name(self) -> str:
182        """The name of the table"""
183        return self._table.name

The name of the table

sheet_name: str
185    @property
186    def sheet_name(self) -> str:
187        """The name of the sheet this table belongs to"""
188        return self._table.sheet_name

The name of the sheet this table belongs to

width: int
190    @property
191    def width(self) -> int:
192        """The table's width"""
193        return self._table.width

The table's width

height: int
195    @property
196    def height(self) -> int:
197        """The table's height"""
198        return self._table.height

The table's height

total_height: int
200    @property
201    def total_height(self) -> int:
202        """The table's total height"""
203        return self._table.total_height

The table's total height

offset: int
205    @property
206    def offset(self) -> int:
207        """The table's offset before data starts"""
208        return self._table.offset

The table's offset before data starts

selected_columns: list[ColumnInfo]
210    @property
211    def selected_columns(self) -> list[ColumnInfo]:
212        """The table's selected columns"""
213        return self._table.selected_columns

The table's selected columns

def available_columns(self) -> list[ColumnInfo]:
215    def available_columns(self) -> list[ColumnInfo]:
216        """The columns available for the given table"""
217        return self._table.available_columns()

The columns available for the given table

specified_dtypes: 'DTypeMap | None'
219    @property
220    def specified_dtypes(self) -> DTypeMap | None:
221        """The dtypes specified for the table"""
222        return self._table.specified_dtypes

The dtypes specified for the table

def to_arrow(self) -> pyarrow.lib.RecordBatch:
224    def to_arrow(self) -> pa.RecordBatch:
225        """Converts the table to a pyarrow `RecordBatch`
226
227        Requires the `pyarrow` extra to be installed.
228        """
229        if not _PYARROW_AVAILABLE:
230            raise ImportError(
231                "pyarrow is required for to_arrow(). Install with: pip install 'fastexcel[pyarrow]'"
232            )
233        return self._table.to_arrow()

Converts the table to a pyarrow RecordBatch

Requires the pyarrow extra to be installed.

def to_pandas(self) -> pandas.core.frame.DataFrame:
235    def to_pandas(self) -> pd.DataFrame:
236        """Converts the table to a Pandas `DataFrame`.
237
238        Requires the `pandas` extra to be installed.
239        """
240        # Note: pandas PyCapsule interface requires __dataframe__ or __arrow_c_stream__
241        # which we don't implement. Using pyarrow conversion for now.
242        # (see https://pandas.pydata.org/docs/reference/api/pandas.api.interchange.from_dataframe.html)
243        return self.to_arrow().to_pandas()

Converts the table to a Pandas DataFrame.

Requires the pandas extra to be installed.

def to_polars(self) -> polars.dataframe.frame.DataFrame:
245    def to_polars(self) -> pl.DataFrame:
246        """Converts the table to a Polars `DataFrame`.
247
248        Uses the Arrow PyCapsule Interface for zero-copy data exchange.
249        Requires the `polars` extra to be installed.
250        """
251        import polars as pl
252
253        return pl.DataFrame(self)

Converts the table to a Polars DataFrame.

Uses the Arrow PyCapsule Interface for zero-copy data exchange. Requires the polars extra to be installed.

DTypeFrom: TypeAlias = Literal['provided_for_all', 'provided_by_index', 'provided_by_name', 'guessed']
ColumnNameFrom: TypeAlias = Literal['provided', 'looked_up', 'generated']
class ColumnInfo:

Metadata about a single column in a sheet.

name

str. The name of the column

dtype_from

fastexcel.DTypeFrom. How the dtype of the column was determined.

One of three possible values:

  • "provided_by_index": The dtype was specified via the column index
  • "provided_by_name": The dtype was specified via the column name
  • "guessed": The dtype was determined from the content of the column
column_name_from

fastexcel.ColumnNameFrom. How the name of the column was determined.

One of three possible values:

  • "provided": The column name was provided via the use_columns parameter
  • "looked_up": The column name was looked up from the data found in the sheet
  • "generated": The column name was generated from the column index, either because header_row was None, or because it could not be looked up
dtype

fastexcel.DType. The dtype of the column

index

int. The index of the column

class DefinedName:
formula
name
class CellError:
position

(int, int). The original row and column of the error

row_offset

int. The row offset

detail

str. The error message

offset_position
class CellErrors:
errors
class FastExcelError(builtins.Exception):

The base class for all fastexcel errors

class CannotRetrieveCellDataError(fastexcel.FastExcelError):

Data for a given cell cannot be retrieved

class CalamineCellError(fastexcel.FastExcelError):

calamine returned an error regarding the content of the cell

class CalamineError(fastexcel.FastExcelError):

Generic calamine error

class SheetNotFoundError(fastexcel.FastExcelError):

Sheet was not found

class ColumnNotFoundError(fastexcel.FastExcelError):

Column was not found

class ArrowError(fastexcel.FastExcelError):

Generic arrow error

class InvalidParametersError(fastexcel.FastExcelError):

Provided parameters are invalid

class UnsupportedColumnTypeCombinationError(fastexcel.FastExcelError):

Column contains an unsupported type combination