fastexcel
1from __future__ import annotations 2 3import sys 4import typing 5from typing import TYPE_CHECKING, Callable, Literal 6 7if sys.version_info < (3, 10): 8 from typing_extensions import TypeAlias 9else: 10 from typing import TypeAlias 11 12if TYPE_CHECKING: 13 import pandas as pd 14 import polars as pl 15 import pyarrow as pa 16 17from os.path import expanduser 18from pathlib import Path 19 20try: 21 import importlib.util 22 23 importlib.util.find_spec("pyarrow") 24 _PYARROW_AVAILABLE = True 25except ImportError: 26 _PYARROW_AVAILABLE = False 27 28from ._fastexcel import ( 29 ArrowError, 30 CalamineCellError, 31 CalamineError, 32 CannotRetrieveCellDataError, 33 CellError, 34 CellErrors, 35 ColumnInfo, 36 ColumnInfoNoDtype, 37 ColumnNotFoundError, 38 FastExcelError, 39 InvalidParametersError, 40 SheetNotFoundError, 41 UnsupportedColumnTypeCombinationError, 42 __version__, 43 _ExcelReader, 44 _ExcelSheet, 45 _ExcelTable, 46) 47from ._fastexcel import read_excel as _read_excel 48 49DType = Literal["null", "int", "float", "string", "boolean", "datetime", "date", "duration"] 50DTypeMap: TypeAlias = "dict[str | int, DType]" 51ColumnNameFrom: TypeAlias = Literal["provided", "looked_up", "generated"] 52DTypeFrom: TypeAlias = Literal[ 53 "provided_for_all", "provided_by_index", "provided_by_name", "guessed" 54] 55SheetVisible: TypeAlias = Literal["visible", "hidden", "veryhidden"] 56 57 58class ExcelSheet: 59 """A class representing a single sheet in an Excel File""" 60 61 def __init__(self, sheet: _ExcelSheet) -> None: 62 self._sheet = sheet 63 64 @property 65 def name(self) -> str: 66 """The name of the sheet""" 67 return self._sheet.name 68 69 @property 70 def width(self) -> int: 71 """The sheet's width""" 72 return self._sheet.width 73 74 @property 75 def height(self) -> int: 76 """The sheet's height, with `skip_rows` and `nrows` applied""" 77 return self._sheet.height 78 79 @property 80 def total_height(self) -> int: 81 """The sheet's total height""" 82 return self._sheet.total_height 83 84 @property 85 def selected_columns(self) -> list[ColumnInfo]: 86 """The sheet's selected columns""" 87 return self._sheet.selected_columns 88 89 def available_columns(self) -> list[ColumnInfo]: 90 """The columns available for the given sheet""" 91 return self._sheet.available_columns() 92 93 @property 94 def specified_dtypes(self) -> DTypeMap | None: 95 """The dtypes specified for the sheet""" 96 return self._sheet.specified_dtypes 97 98 @property 99 def visible(self) -> SheetVisible: 100 """The visibility of the sheet""" 101 return self._sheet.visible 102 103 def to_arrow(self) -> "pa.RecordBatch": 104 """Converts the sheet to a pyarrow `RecordBatch` 105 106 Requires the `pyarrow` extra to be installed. 107 """ 108 if not _PYARROW_AVAILABLE: 109 raise ImportError( 110 "pyarrow is required for to_arrow(). Install with: pip install 'fastexcel[pyarrow]'" 111 ) 112 return self._sheet.to_arrow() 113 114 def to_arrow_with_errors(self) -> "tuple[pa.RecordBatch, CellErrors | None]": 115 """Converts the sheet to a pyarrow `RecordBatch` with error information. 116 117 Stores the positions of any values that cannot be parsed as the specified type and were 118 therefore converted to None. 119 120 Requires the `pyarrow` extra to be installed. 121 """ 122 if not _PYARROW_AVAILABLE: 123 raise ImportError( 124 "pyarrow is required for to_arrow_with_errors(). Install with: pip install 'fastexcel[pyarrow]'" # noqa: E501 125 ) 126 rb, cell_errors = self._sheet.to_arrow_with_errors() 127 if not cell_errors.errors: 128 return (rb, None) 129 return (rb, cell_errors) 130 131 def to_pandas(self) -> "pd.DataFrame": 132 """Converts the sheet to a Pandas `DataFrame`. 133 134 Requires the `pandas` extra to be installed. 135 """ 136 # Note: pandas PyCapsule interface requires __dataframe__ or __arrow_c_stream__ 137 # which we don't implement. Using pyarrow conversion for now. 138 # (see https://pandas.pydata.org/docs/reference/api/pandas.api.interchange.from_dataframe.html) 139 return self.to_arrow().to_pandas() 140 141 def to_polars(self) -> "pl.DataFrame": 142 """Converts the sheet to a Polars `DataFrame`. 143 144 Uses the Arrow PyCapsule Interface for zero-copy data exchange. 145 Requires the `polars` extra to be installed. 146 """ 147 import polars as pl 148 149 return pl.DataFrame(self) 150 151 def __arrow_c_schema__(self) -> object: 152 """Export the schema as an `ArrowSchema` `PyCapsule`. 153 154 https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowschema-export 155 156 The Arrow PyCapsule Interface enables zero-copy data exchange with 157 Arrow-compatible libraries without requiring PyArrow as a dependency. 158 """ 159 return self._sheet.__arrow_c_schema__() 160 161 def __arrow_c_array__(self, requested_schema: object | None = None) -> tuple[object, object]: 162 """Export the schema and data as a pair of `ArrowSchema` and `ArrowArray` `PyCapsules`. 163 164 The optional `requested_schema` parameter allows for potential schema conversion. 165 166 https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowarray-export 167 168 The Arrow PyCapsule Interface enables zero-copy data exchange with 169 Arrow-compatible libraries without requiring PyArrow as a dependency. 170 """ 171 return self._sheet.__arrow_c_array__(requested_schema) 172 173 def __repr__(self) -> str: 174 return self._sheet.__repr__() 175 176 177class ExcelTable: 178 """A class representing a single table in an Excel file""" 179 180 def __init__(self, table: _ExcelTable) -> None: 181 self._table = table 182 183 @property 184 def name(self) -> str: 185 """The name of the table""" 186 return self._table.name 187 188 @property 189 def sheet_name(self) -> str: 190 """The name of the sheet this table belongs to""" 191 return self._table.sheet_name 192 193 @property 194 def width(self) -> int: 195 """The table's width""" 196 return self._table.width 197 198 @property 199 def height(self) -> int: 200 """The table's height""" 201 return self._table.height 202 203 @property 204 def total_height(self) -> int: 205 """The table's total height""" 206 return self._table.total_height 207 208 @property 209 def offset(self) -> int: 210 """The table's offset before data starts""" 211 return self._table.offset 212 213 @property 214 def selected_columns(self) -> list[ColumnInfo]: 215 """The table's selected columns""" 216 return self._table.selected_columns 217 218 def available_columns(self) -> list[ColumnInfo]: 219 """The columns available for the given table""" 220 return self._table.available_columns() 221 222 @property 223 def specified_dtypes(self) -> DTypeMap | None: 224 """The dtypes specified for the table""" 225 return self._table.specified_dtypes 226 227 def to_arrow(self) -> "pa.RecordBatch": 228 """Converts the table to a pyarrow `RecordBatch` 229 230 Requires the `pyarrow` extra to be installed. 231 """ 232 if not _PYARROW_AVAILABLE: 233 raise ImportError( 234 "pyarrow is required for to_arrow(). Install with: pip install 'fastexcel[pyarrow]'" 235 ) 236 return self._table.to_arrow() 237 238 def to_pandas(self) -> "pd.DataFrame": 239 """Converts the table to a Pandas `DataFrame`. 240 241 Requires the `pandas` extra to be installed. 242 """ 243 # Note: pandas PyCapsule interface requires __dataframe__ or __arrow_c_stream__ 244 # which we don't implement. Using pyarrow conversion for now. 245 # (see https://pandas.pydata.org/docs/reference/api/pandas.api.interchange.from_dataframe.html) 246 return self.to_arrow().to_pandas() 247 248 def to_polars(self) -> "pl.DataFrame": 249 """Converts the table to a Polars `DataFrame`. 250 251 Uses the Arrow PyCapsule Interface for zero-copy data exchange. 252 Requires the `polars` extra to be installed. 253 """ 254 import polars as pl 255 256 return pl.DataFrame(self) 257 258 def __arrow_c_schema__(self) -> object: 259 """Export the schema as an `ArrowSchema` `PyCapsule`. 260 261 https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowschema-export 262 263 The Arrow PyCapsule Interface enables zero-copy data exchange with 264 Arrow-compatible libraries without requiring PyArrow as a dependency. 265 """ 266 return self._table.__arrow_c_schema__() 267 268 def __arrow_c_array__(self, requested_schema: object | None = None) -> tuple[object, object]: 269 """Export the schema and data as a pair of `ArrowSchema` and `ArrowArray` `PyCapsules`. 270 271 The optional `requested_schema` parameter allows for potential schema conversion. 272 273 https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowarray-export 274 275 The Arrow PyCapsule Interface enables zero-copy data exchange with 276 Arrow-compatible libraries without requiring PyArrow as a dependency. 277 """ 278 return self._table.__arrow_c_array__(requested_schema) 279 280 281class ExcelReader: 282 """A class representing an open Excel file and allowing to read its sheets""" 283 284 def __init__(self, reader: _ExcelReader) -> None: 285 self._reader = reader 286 287 @property 288 def sheet_names(self) -> list[str]: 289 """The list of sheet names""" 290 return self._reader.sheet_names 291 292 @typing.overload 293 def load_sheet( 294 self, 295 idx_or_name: int | str, 296 *, 297 header_row: int | None = 0, 298 column_names: list[str] | None = None, 299 skip_rows: int | list[int] | Callable[[int], bool] | None = None, 300 n_rows: int | None = None, 301 schema_sample_rows: int | None = 1_000, 302 dtype_coercion: Literal["coerce", "strict"] = "coerce", 303 use_columns: list[str] 304 | list[int] 305 | str 306 | Callable[[ColumnInfoNoDtype], bool] 307 | None = None, 308 dtypes: DType | DTypeMap | None = None, 309 eager: Literal[False] = ..., 310 ) -> ExcelSheet: ... 311 312 @typing.overload 313 def load_sheet( 314 self, 315 idx_or_name: int | str, 316 *, 317 header_row: int | None = 0, 318 column_names: list[str] | None = None, 319 skip_rows: int | list[int] | Callable[[int], bool] | None = None, 320 n_rows: int | None = None, 321 schema_sample_rows: int | None = 1_000, 322 dtype_coercion: Literal["coerce", "strict"] = "coerce", 323 use_columns: list[str] 324 | list[int] 325 | str 326 | Callable[[ColumnInfoNoDtype], bool] 327 | None = None, 328 dtypes: DType | DTypeMap | None = None, 329 eager: Literal[True] = ..., 330 ) -> "pa.RecordBatch": ... 331 332 def load_sheet( 333 self, 334 idx_or_name: int | str, 335 *, 336 header_row: int | None = 0, 337 column_names: list[str] | None = None, 338 skip_rows: int | list[int] | Callable[[int], bool] | None = None, 339 n_rows: int | None = None, 340 schema_sample_rows: int | None = 1_000, 341 dtype_coercion: Literal["coerce", "strict"] = "coerce", 342 use_columns: list[str] 343 | list[int] 344 | str 345 | Callable[[ColumnInfoNoDtype], bool] 346 | None = None, 347 dtypes: DType | DTypeMap | None = None, 348 eager: bool = False, 349 ) -> "ExcelSheet | pa.RecordBatch": 350 """Loads a sheet by index or name. 351 352 :param idx_or_name: The index (starting at 0) or the name of the sheet to load. 353 :param header_row: The index of the row containing the column labels, default index is 0. 354 If `None`, the sheet does not have any column labels. 355 Any rows before the `header_row` will be automatically skipped. 356 :param column_names: Overrides headers found in the document. 357 If `column_names` is used, `header_row` will be ignored. 358 :param n_rows: Specifies how many rows should be loaded. 359 If `None`, all rows are loaded 360 :param skip_rows: Specifies which rows should be skipped after the `header_row`. 361 Any rows before the `header_row` are automatically skipped. 362 It means row indices are relative to data rows, not the sheet! 363 Can be one of: 364 - `int`: Skip this many rows after the header row 365 - `list[int]`: Skip specific row indices (0-based relative to data rows) 366 - `Callable[[int], bool]`: Function that receives row index (0-based 367 relative to data rows) and returns True to skip the row 368 - `None`: If `header_row` is None, skips empty rows at beginning 369 :param schema_sample_rows: Specifies how many rows should be used to determine 370 the dtype of a column. Cannot be 0. A specific dtype can be 371 enforced for some or all columns through the `dtypes` parameter. 372 If `None`, all rows will be used. 373 :param dtype_coercion: Specifies how type coercion should behave. `coerce` (the default) 374 will try to coerce different dtypes in a column to the same one, 375 whereas `strict` will raise an error in case a column contains 376 several dtypes. Note that this only applies to columns whose dtype 377 is guessed, i.e. not specified via `dtypes`. 378 :param use_columns: Specifies the columns to use. Can either be: 379 - `None` to select all columns 380 - A list of strings and ints, the column names and/or indices 381 (starting at 0) 382 - A string, a comma separated list of Excel column letters and column 383 ranges (e.g. `"A:E"` or `"A,C,E:F"`, which would result in 384 `A,B,C,D,E` and `A,C,E,F`). Also supports open-ended ranges 385 (e.g. `"B:"` to select all columns from B onwards) and from-beginning 386 ranges (e.g. `":C"` to select columns from A to C). These can be 387 combined for "except" patterns (e.g. `":C,E:"` to select everything 388 except column D) 389 - A callable, a function that takes a column and returns a boolean 390 indicating whether the column should be used 391 :param dtypes: An optional dtype (for all columns) 392 or dict of dtypes with keys as column indices or names. 393 :param eager: Specifies whether the sheet should be loaded eagerly. 394 `False` (default) will load the sheet lazily using the `PyCapsule` interface, 395 whereas `True` will load it eagerly via `pyarrow`. 396 397 Eager loading requires the `pyarrow` extra to be installed. 398 """ 399 sheet_or_rb = self._reader.load_sheet( 400 idx_or_name=idx_or_name, 401 header_row=header_row, 402 column_names=column_names, 403 skip_rows=skip_rows, 404 n_rows=n_rows, 405 schema_sample_rows=schema_sample_rows, 406 dtype_coercion=dtype_coercion, 407 use_columns=use_columns, 408 dtypes=dtypes, 409 eager=eager, 410 ) 411 return sheet_or_rb if eager else ExcelSheet(sheet_or_rb) 412 413 def table_names(self, sheet_name: str | None = None) -> list[str]: 414 """The list of table names. 415 416 Will return an empty list if no tables are found. 417 418 :param sheet_name: If given, will limit the list to the given sheet, will be faster 419 too. 420 """ 421 return self._reader.table_names(sheet_name) 422 423 @typing.overload 424 def load_table( 425 self, 426 name: str, 427 *, 428 header_row: int | None = None, 429 column_names: list[str] | None = None, 430 skip_rows: int | None = None, 431 n_rows: int | None = None, 432 schema_sample_rows: int | None = 1_000, 433 dtype_coercion: Literal["coerce", "strict"] = "coerce", 434 use_columns: list[str] 435 | list[int] 436 | str 437 | Callable[[ColumnInfoNoDtype], bool] 438 | None = None, 439 dtypes: DType | DTypeMap | None = None, 440 eager: Literal[False] = ..., 441 ) -> ExcelTable: ... 442 443 @typing.overload 444 def load_table( 445 self, 446 name: str, 447 *, 448 header_row: int | None = None, 449 column_names: list[str] | None = None, 450 skip_rows: int | None = None, 451 n_rows: int | None = None, 452 schema_sample_rows: int | None = 1_000, 453 dtype_coercion: Literal["coerce", "strict"] = "coerce", 454 use_columns: list[str] 455 | list[int] 456 | str 457 | Callable[[ColumnInfoNoDtype], bool] 458 | None = None, 459 dtypes: DType | DTypeMap | None = None, 460 eager: Literal[True] = ..., 461 ) -> "pa.RecordBatch": ... 462 463 def load_table( 464 self, 465 name: str, 466 *, 467 header_row: int | None = None, 468 column_names: list[str] | None = None, 469 skip_rows: int | None = None, 470 n_rows: int | None = None, 471 schema_sample_rows: int | None = 1_000, 472 dtype_coercion: Literal["coerce", "strict"] = "coerce", 473 use_columns: list[str] 474 | list[int] 475 | str 476 | Callable[[ColumnInfoNoDtype], bool] 477 | None = None, 478 dtypes: DType | DTypeMap | None = None, 479 eager: bool = False, 480 ) -> "ExcelTable | pa.RecordBatch": 481 """Loads a table by name. 482 483 :param name: The name of the table to load. 484 :param header_row: The index of the row containing the column labels. 485 If `None`, the table's column names will be used. 486 Any rows before the `header_row` will be automatically skipped. 487 :param column_names: Overrides headers found in the document. 488 If `column_names` is used, `header_row` will be ignored. 489 :param n_rows: Specifies how many rows should be loaded. 490 If `None`, all rows are loaded 491 :param skip_rows: Specifies how many rows should be skipped after the `header_row`. 492 Any rows before the `header_row` are automatically skipped. 493 If `header_row` is `None`, it skips the number of rows from the 494 start of the sheet. 495 :param schema_sample_rows: Specifies how many rows should be used to determine 496 the dtype of a column. Cannot be 0. A specific dtype can be 497 enforced for some or all columns through the `dtypes` parameter. 498 If `None`, all rows will be used. 499 :param dtype_coercion: Specifies how type coercion should behave. `coerce` (the default) 500 will try to coerce different dtypes in a column to the same one, 501 whereas `strict` will raise an error in case a column contains 502 several dtypes. Note that this only applies to columns whose dtype 503 is guessed, i.e. not specified via `dtypes`. 504 :param use_columns: Specifies the columns to use. Can either be: 505 - `None` to select all columns 506 - A list of strings and ints, the column names and/or indices 507 (starting at 0) 508 - A string, a comma separated list of Excel column letters and column 509 ranges (e.g. `"A:E"` or `"A,C,E:F"`, which would result in 510 `A,B,C,D,E` and `A,C,E,F`). Also supports open-ended ranges 511 (e.g. `"B:"` to select all columns from B onwards) and from-beginning 512 ranges (e.g. `":C"` to select columns from A to C). These can be 513 combined for "except" patterns (e.g. `":C,E:"` to select everything 514 except column D) 515 - A callable, a function that takes a column and returns a boolean 516 indicating whether the column should be used 517 :param dtypes: An optional dtype (for all columns) 518 or dict of dtypes with keys as column indices or names. 519 :param eager: Specifies whether the table should be loaded eagerly. 520 `False` (default) will load the table lazily using the `PyCapsule` interface, 521 whereas `True` will load it eagerly via `pyarrow`. 522 523 Eager loading requires the `pyarrow` extra to be installed. 524 """ 525 if eager: 526 return self._reader.load_table( 527 name=name, 528 header_row=header_row, 529 column_names=column_names, 530 skip_rows=skip_rows, 531 n_rows=n_rows, 532 schema_sample_rows=schema_sample_rows, 533 dtype_coercion=dtype_coercion, 534 use_columns=use_columns, 535 dtypes=dtypes, 536 eager=True, 537 ) 538 else: 539 return ExcelTable( 540 self._reader.load_table( 541 name=name, 542 header_row=header_row, 543 column_names=column_names, 544 skip_rows=skip_rows, 545 n_rows=n_rows, 546 schema_sample_rows=schema_sample_rows, 547 dtype_coercion=dtype_coercion, 548 use_columns=use_columns, 549 dtypes=dtypes, 550 eager=False, 551 ) 552 ) 553 554 def load_sheet_eager( 555 self, 556 idx_or_name: int | str, 557 *, 558 header_row: int | None = 0, 559 column_names: list[str] | None = None, 560 skip_rows: int | list[int] | Callable[[int], bool] | None = None, 561 n_rows: int | None = None, 562 schema_sample_rows: int | None = 1_000, 563 dtype_coercion: Literal["coerce", "strict"] = "coerce", 564 use_columns: list[str] | list[int] | str | None = None, 565 dtypes: DType | DTypeMap | None = None, 566 ) -> "pa.RecordBatch": 567 """Loads a sheet eagerly by index or name. 568 569 For xlsx files, this will be faster and more memory-efficient, as it will use 570 `worksheet_range_ref` under the hood, which returns borrowed types. 571 572 Refer to `load_sheet` for parameter documentation 573 574 Requires the `pyarrow` extra to be installed. 575 """ 576 return self._reader.load_sheet( 577 idx_or_name=idx_or_name, 578 header_row=header_row, 579 column_names=column_names, 580 skip_rows=skip_rows, 581 n_rows=n_rows, 582 schema_sample_rows=schema_sample_rows, 583 dtype_coercion=dtype_coercion, 584 use_columns=use_columns, 585 dtypes=dtypes, 586 eager=True, 587 ) 588 589 def load_sheet_by_name( 590 self, 591 name: str, 592 *, 593 header_row: int | None = 0, 594 column_names: list[str] | None = None, 595 skip_rows: int | None = None, 596 n_rows: int | None = None, 597 schema_sample_rows: int | None = 1_000, 598 dtype_coercion: Literal["coerce", "strict"] = "coerce", 599 use_columns: list[str] 600 | list[int] 601 | str 602 | Callable[[ColumnInfoNoDtype], bool] 603 | None = None, 604 dtypes: DType | DTypeMap | None = None, 605 ) -> ExcelSheet: 606 """Loads a sheet by name. 607 608 Refer to `load_sheet` for parameter documentation 609 """ 610 return self.load_sheet( 611 name, 612 header_row=header_row, 613 column_names=column_names, 614 skip_rows=skip_rows, 615 n_rows=n_rows, 616 schema_sample_rows=schema_sample_rows, 617 dtype_coercion=dtype_coercion, 618 use_columns=use_columns, 619 dtypes=dtypes, 620 ) 621 622 def load_sheet_by_idx( 623 self, 624 idx: int, 625 *, 626 header_row: int | None = 0, 627 column_names: list[str] | None = None, 628 skip_rows: int | None = None, 629 n_rows: int | None = None, 630 schema_sample_rows: int | None = 1_000, 631 dtype_coercion: Literal["coerce", "strict"] = "coerce", 632 use_columns: list[str] 633 | list[int] 634 | str 635 | Callable[[ColumnInfoNoDtype], bool] 636 | None = None, 637 dtypes: DType | DTypeMap | None = None, 638 ) -> ExcelSheet: 639 """Loads a sheet by index. 640 641 Refer to `load_sheet` for parameter documentation 642 """ 643 return self.load_sheet( 644 idx, 645 header_row=header_row, 646 column_names=column_names, 647 skip_rows=skip_rows, 648 n_rows=n_rows, 649 schema_sample_rows=schema_sample_rows, 650 dtype_coercion=dtype_coercion, 651 use_columns=use_columns, 652 dtypes=dtypes, 653 ) 654 655 def __repr__(self) -> str: 656 return self._reader.__repr__() 657 658 659def read_excel(source: Path | str | bytes) -> ExcelReader: 660 """Opens and loads an excel file. 661 662 :param source: The path to a file or its content as bytes 663 """ 664 if isinstance(source, (str, Path)): 665 source = expanduser(source) 666 return ExcelReader(_read_excel(source)) 667 668 669__all__ = ( 670 # version 671 "__version__", 672 # main entrypoint 673 "read_excel", 674 # Python types 675 "DType", 676 "DTypeMap", 677 # Excel reader 678 "ExcelReader", 679 # Excel sheet 680 "ExcelSheet", 681 # Excel table 682 "ExcelTable", 683 # Column metadata 684 "DTypeFrom", 685 "ColumnNameFrom", 686 "ColumnInfo", 687 # Parse error information 688 "CellError", 689 "CellErrors", 690 # Exceptions 691 "FastExcelError", 692 "CannotRetrieveCellDataError", 693 "CalamineCellError", 694 "CalamineError", 695 "SheetNotFoundError", 696 "ColumnNotFoundError", 697 "ArrowError", 698 "InvalidParametersError", 699 "UnsupportedColumnTypeCombinationError", 700)
660def read_excel(source: Path | str | bytes) -> ExcelReader: 661 """Opens and loads an excel file. 662 663 :param source: The path to a file or its content as bytes 664 """ 665 if isinstance(source, (str, Path)): 666 source = expanduser(source) 667 return ExcelReader(_read_excel(source))
Opens and loads an excel file.
Parameters
- source: The path to a file or its content as bytes
282class ExcelReader: 283 """A class representing an open Excel file and allowing to read its sheets""" 284 285 def __init__(self, reader: _ExcelReader) -> None: 286 self._reader = reader 287 288 @property 289 def sheet_names(self) -> list[str]: 290 """The list of sheet names""" 291 return self._reader.sheet_names 292 293 @typing.overload 294 def load_sheet( 295 self, 296 idx_or_name: int | str, 297 *, 298 header_row: int | None = 0, 299 column_names: list[str] | None = None, 300 skip_rows: int | list[int] | Callable[[int], bool] | None = None, 301 n_rows: int | None = None, 302 schema_sample_rows: int | None = 1_000, 303 dtype_coercion: Literal["coerce", "strict"] = "coerce", 304 use_columns: list[str] 305 | list[int] 306 | str 307 | Callable[[ColumnInfoNoDtype], bool] 308 | None = None, 309 dtypes: DType | DTypeMap | None = None, 310 eager: Literal[False] = ..., 311 ) -> ExcelSheet: ... 312 313 @typing.overload 314 def load_sheet( 315 self, 316 idx_or_name: int | str, 317 *, 318 header_row: int | None = 0, 319 column_names: list[str] | None = None, 320 skip_rows: int | list[int] | Callable[[int], bool] | None = None, 321 n_rows: int | None = None, 322 schema_sample_rows: int | None = 1_000, 323 dtype_coercion: Literal["coerce", "strict"] = "coerce", 324 use_columns: list[str] 325 | list[int] 326 | str 327 | Callable[[ColumnInfoNoDtype], bool] 328 | None = None, 329 dtypes: DType | DTypeMap | None = None, 330 eager: Literal[True] = ..., 331 ) -> "pa.RecordBatch": ... 332 333 def load_sheet( 334 self, 335 idx_or_name: int | str, 336 *, 337 header_row: int | None = 0, 338 column_names: list[str] | None = None, 339 skip_rows: int | list[int] | Callable[[int], bool] | None = None, 340 n_rows: int | None = None, 341 schema_sample_rows: int | None = 1_000, 342 dtype_coercion: Literal["coerce", "strict"] = "coerce", 343 use_columns: list[str] 344 | list[int] 345 | str 346 | Callable[[ColumnInfoNoDtype], bool] 347 | None = None, 348 dtypes: DType | DTypeMap | None = None, 349 eager: bool = False, 350 ) -> "ExcelSheet | pa.RecordBatch": 351 """Loads a sheet by index or name. 352 353 :param idx_or_name: The index (starting at 0) or the name of the sheet to load. 354 :param header_row: The index of the row containing the column labels, default index is 0. 355 If `None`, the sheet does not have any column labels. 356 Any rows before the `header_row` will be automatically skipped. 357 :param column_names: Overrides headers found in the document. 358 If `column_names` is used, `header_row` will be ignored. 359 :param n_rows: Specifies how many rows should be loaded. 360 If `None`, all rows are loaded 361 :param skip_rows: Specifies which rows should be skipped after the `header_row`. 362 Any rows before the `header_row` are automatically skipped. 363 It means row indices are relative to data rows, not the sheet! 364 Can be one of: 365 - `int`: Skip this many rows after the header row 366 - `list[int]`: Skip specific row indices (0-based relative to data rows) 367 - `Callable[[int], bool]`: Function that receives row index (0-based 368 relative to data rows) and returns True to skip the row 369 - `None`: If `header_row` is None, skips empty rows at beginning 370 :param schema_sample_rows: Specifies how many rows should be used to determine 371 the dtype of a column. Cannot be 0. A specific dtype can be 372 enforced for some or all columns through the `dtypes` parameter. 373 If `None`, all rows will be used. 374 :param dtype_coercion: Specifies how type coercion should behave. `coerce` (the default) 375 will try to coerce different dtypes in a column to the same one, 376 whereas `strict` will raise an error in case a column contains 377 several dtypes. Note that this only applies to columns whose dtype 378 is guessed, i.e. not specified via `dtypes`. 379 :param use_columns: Specifies the columns to use. Can either be: 380 - `None` to select all columns 381 - A list of strings and ints, the column names and/or indices 382 (starting at 0) 383 - A string, a comma separated list of Excel column letters and column 384 ranges (e.g. `"A:E"` or `"A,C,E:F"`, which would result in 385 `A,B,C,D,E` and `A,C,E,F`). Also supports open-ended ranges 386 (e.g. `"B:"` to select all columns from B onwards) and from-beginning 387 ranges (e.g. `":C"` to select columns from A to C). These can be 388 combined for "except" patterns (e.g. `":C,E:"` to select everything 389 except column D) 390 - A callable, a function that takes a column and returns a boolean 391 indicating whether the column should be used 392 :param dtypes: An optional dtype (for all columns) 393 or dict of dtypes with keys as column indices or names. 394 :param eager: Specifies whether the sheet should be loaded eagerly. 395 `False` (default) will load the sheet lazily using the `PyCapsule` interface, 396 whereas `True` will load it eagerly via `pyarrow`. 397 398 Eager loading requires the `pyarrow` extra to be installed. 399 """ 400 sheet_or_rb = self._reader.load_sheet( 401 idx_or_name=idx_or_name, 402 header_row=header_row, 403 column_names=column_names, 404 skip_rows=skip_rows, 405 n_rows=n_rows, 406 schema_sample_rows=schema_sample_rows, 407 dtype_coercion=dtype_coercion, 408 use_columns=use_columns, 409 dtypes=dtypes, 410 eager=eager, 411 ) 412 return sheet_or_rb if eager else ExcelSheet(sheet_or_rb) 413 414 def table_names(self, sheet_name: str | None = None) -> list[str]: 415 """The list of table names. 416 417 Will return an empty list if no tables are found. 418 419 :param sheet_name: If given, will limit the list to the given sheet, will be faster 420 too. 421 """ 422 return self._reader.table_names(sheet_name) 423 424 @typing.overload 425 def load_table( 426 self, 427 name: str, 428 *, 429 header_row: int | None = None, 430 column_names: list[str] | None = None, 431 skip_rows: int | None = None, 432 n_rows: int | None = None, 433 schema_sample_rows: int | None = 1_000, 434 dtype_coercion: Literal["coerce", "strict"] = "coerce", 435 use_columns: list[str] 436 | list[int] 437 | str 438 | Callable[[ColumnInfoNoDtype], bool] 439 | None = None, 440 dtypes: DType | DTypeMap | None = None, 441 eager: Literal[False] = ..., 442 ) -> ExcelTable: ... 443 444 @typing.overload 445 def load_table( 446 self, 447 name: str, 448 *, 449 header_row: int | None = None, 450 column_names: list[str] | None = None, 451 skip_rows: int | None = None, 452 n_rows: int | None = None, 453 schema_sample_rows: int | None = 1_000, 454 dtype_coercion: Literal["coerce", "strict"] = "coerce", 455 use_columns: list[str] 456 | list[int] 457 | str 458 | Callable[[ColumnInfoNoDtype], bool] 459 | None = None, 460 dtypes: DType | DTypeMap | None = None, 461 eager: Literal[True] = ..., 462 ) -> "pa.RecordBatch": ... 463 464 def load_table( 465 self, 466 name: str, 467 *, 468 header_row: int | None = None, 469 column_names: list[str] | None = None, 470 skip_rows: int | None = None, 471 n_rows: int | None = None, 472 schema_sample_rows: int | None = 1_000, 473 dtype_coercion: Literal["coerce", "strict"] = "coerce", 474 use_columns: list[str] 475 | list[int] 476 | str 477 | Callable[[ColumnInfoNoDtype], bool] 478 | None = None, 479 dtypes: DType | DTypeMap | None = None, 480 eager: bool = False, 481 ) -> "ExcelTable | pa.RecordBatch": 482 """Loads a table by name. 483 484 :param name: The name of the table to load. 485 :param header_row: The index of the row containing the column labels. 486 If `None`, the table's column names will be used. 487 Any rows before the `header_row` will be automatically skipped. 488 :param column_names: Overrides headers found in the document. 489 If `column_names` is used, `header_row` will be ignored. 490 :param n_rows: Specifies how many rows should be loaded. 491 If `None`, all rows are loaded 492 :param skip_rows: Specifies how many rows should be skipped after the `header_row`. 493 Any rows before the `header_row` are automatically skipped. 494 If `header_row` is `None`, it skips the number of rows from the 495 start of the sheet. 496 :param schema_sample_rows: Specifies how many rows should be used to determine 497 the dtype of a column. Cannot be 0. A specific dtype can be 498 enforced for some or all columns through the `dtypes` parameter. 499 If `None`, all rows will be used. 500 :param dtype_coercion: Specifies how type coercion should behave. `coerce` (the default) 501 will try to coerce different dtypes in a column to the same one, 502 whereas `strict` will raise an error in case a column contains 503 several dtypes. Note that this only applies to columns whose dtype 504 is guessed, i.e. not specified via `dtypes`. 505 :param use_columns: Specifies the columns to use. Can either be: 506 - `None` to select all columns 507 - A list of strings and ints, the column names and/or indices 508 (starting at 0) 509 - A string, a comma separated list of Excel column letters and column 510 ranges (e.g. `"A:E"` or `"A,C,E:F"`, which would result in 511 `A,B,C,D,E` and `A,C,E,F`). Also supports open-ended ranges 512 (e.g. `"B:"` to select all columns from B onwards) and from-beginning 513 ranges (e.g. `":C"` to select columns from A to C). These can be 514 combined for "except" patterns (e.g. `":C,E:"` to select everything 515 except column D) 516 - A callable, a function that takes a column and returns a boolean 517 indicating whether the column should be used 518 :param dtypes: An optional dtype (for all columns) 519 or dict of dtypes with keys as column indices or names. 520 :param eager: Specifies whether the table should be loaded eagerly. 521 `False` (default) will load the table lazily using the `PyCapsule` interface, 522 whereas `True` will load it eagerly via `pyarrow`. 523 524 Eager loading requires the `pyarrow` extra to be installed. 525 """ 526 if eager: 527 return self._reader.load_table( 528 name=name, 529 header_row=header_row, 530 column_names=column_names, 531 skip_rows=skip_rows, 532 n_rows=n_rows, 533 schema_sample_rows=schema_sample_rows, 534 dtype_coercion=dtype_coercion, 535 use_columns=use_columns, 536 dtypes=dtypes, 537 eager=True, 538 ) 539 else: 540 return ExcelTable( 541 self._reader.load_table( 542 name=name, 543 header_row=header_row, 544 column_names=column_names, 545 skip_rows=skip_rows, 546 n_rows=n_rows, 547 schema_sample_rows=schema_sample_rows, 548 dtype_coercion=dtype_coercion, 549 use_columns=use_columns, 550 dtypes=dtypes, 551 eager=False, 552 ) 553 ) 554 555 def load_sheet_eager( 556 self, 557 idx_or_name: int | str, 558 *, 559 header_row: int | None = 0, 560 column_names: list[str] | None = None, 561 skip_rows: int | list[int] | Callable[[int], bool] | None = None, 562 n_rows: int | None = None, 563 schema_sample_rows: int | None = 1_000, 564 dtype_coercion: Literal["coerce", "strict"] = "coerce", 565 use_columns: list[str] | list[int] | str | None = None, 566 dtypes: DType | DTypeMap | None = None, 567 ) -> "pa.RecordBatch": 568 """Loads a sheet eagerly by index or name. 569 570 For xlsx files, this will be faster and more memory-efficient, as it will use 571 `worksheet_range_ref` under the hood, which returns borrowed types. 572 573 Refer to `load_sheet` for parameter documentation 574 575 Requires the `pyarrow` extra to be installed. 576 """ 577 return self._reader.load_sheet( 578 idx_or_name=idx_or_name, 579 header_row=header_row, 580 column_names=column_names, 581 skip_rows=skip_rows, 582 n_rows=n_rows, 583 schema_sample_rows=schema_sample_rows, 584 dtype_coercion=dtype_coercion, 585 use_columns=use_columns, 586 dtypes=dtypes, 587 eager=True, 588 ) 589 590 def load_sheet_by_name( 591 self, 592 name: str, 593 *, 594 header_row: int | None = 0, 595 column_names: list[str] | None = None, 596 skip_rows: int | None = None, 597 n_rows: int | None = None, 598 schema_sample_rows: int | None = 1_000, 599 dtype_coercion: Literal["coerce", "strict"] = "coerce", 600 use_columns: list[str] 601 | list[int] 602 | str 603 | Callable[[ColumnInfoNoDtype], bool] 604 | None = None, 605 dtypes: DType | DTypeMap | None = None, 606 ) -> ExcelSheet: 607 """Loads a sheet by name. 608 609 Refer to `load_sheet` for parameter documentation 610 """ 611 return self.load_sheet( 612 name, 613 header_row=header_row, 614 column_names=column_names, 615 skip_rows=skip_rows, 616 n_rows=n_rows, 617 schema_sample_rows=schema_sample_rows, 618 dtype_coercion=dtype_coercion, 619 use_columns=use_columns, 620 dtypes=dtypes, 621 ) 622 623 def load_sheet_by_idx( 624 self, 625 idx: int, 626 *, 627 header_row: int | None = 0, 628 column_names: list[str] | None = None, 629 skip_rows: int | None = None, 630 n_rows: int | None = None, 631 schema_sample_rows: int | None = 1_000, 632 dtype_coercion: Literal["coerce", "strict"] = "coerce", 633 use_columns: list[str] 634 | list[int] 635 | str 636 | Callable[[ColumnInfoNoDtype], bool] 637 | None = None, 638 dtypes: DType | DTypeMap | None = None, 639 ) -> ExcelSheet: 640 """Loads a sheet by index. 641 642 Refer to `load_sheet` for parameter documentation 643 """ 644 return self.load_sheet( 645 idx, 646 header_row=header_row, 647 column_names=column_names, 648 skip_rows=skip_rows, 649 n_rows=n_rows, 650 schema_sample_rows=schema_sample_rows, 651 dtype_coercion=dtype_coercion, 652 use_columns=use_columns, 653 dtypes=dtypes, 654 ) 655 656 def __repr__(self) -> str: 657 return self._reader.__repr__()
A class representing an open Excel file and allowing to read its sheets
288 @property 289 def sheet_names(self) -> list[str]: 290 """The list of sheet names""" 291 return self._reader.sheet_names
The list of sheet names
333 def load_sheet( 334 self, 335 idx_or_name: int | str, 336 *, 337 header_row: int | None = 0, 338 column_names: list[str] | None = None, 339 skip_rows: int | list[int] | Callable[[int], bool] | None = None, 340 n_rows: int | None = None, 341 schema_sample_rows: int | None = 1_000, 342 dtype_coercion: Literal["coerce", "strict"] = "coerce", 343 use_columns: list[str] 344 | list[int] 345 | str 346 | Callable[[ColumnInfoNoDtype], bool] 347 | None = None, 348 dtypes: DType | DTypeMap | None = None, 349 eager: bool = False, 350 ) -> "ExcelSheet | pa.RecordBatch": 351 """Loads a sheet by index or name. 352 353 :param idx_or_name: The index (starting at 0) or the name of the sheet to load. 354 :param header_row: The index of the row containing the column labels, default index is 0. 355 If `None`, the sheet does not have any column labels. 356 Any rows before the `header_row` will be automatically skipped. 357 :param column_names: Overrides headers found in the document. 358 If `column_names` is used, `header_row` will be ignored. 359 :param n_rows: Specifies how many rows should be loaded. 360 If `None`, all rows are loaded 361 :param skip_rows: Specifies which rows should be skipped after the `header_row`. 362 Any rows before the `header_row` are automatically skipped. 363 It means row indices are relative to data rows, not the sheet! 364 Can be one of: 365 - `int`: Skip this many rows after the header row 366 - `list[int]`: Skip specific row indices (0-based relative to data rows) 367 - `Callable[[int], bool]`: Function that receives row index (0-based 368 relative to data rows) and returns True to skip the row 369 - `None`: If `header_row` is None, skips empty rows at beginning 370 :param schema_sample_rows: Specifies how many rows should be used to determine 371 the dtype of a column. Cannot be 0. A specific dtype can be 372 enforced for some or all columns through the `dtypes` parameter. 373 If `None`, all rows will be used. 374 :param dtype_coercion: Specifies how type coercion should behave. `coerce` (the default) 375 will try to coerce different dtypes in a column to the same one, 376 whereas `strict` will raise an error in case a column contains 377 several dtypes. Note that this only applies to columns whose dtype 378 is guessed, i.e. not specified via `dtypes`. 379 :param use_columns: Specifies the columns to use. Can either be: 380 - `None` to select all columns 381 - A list of strings and ints, the column names and/or indices 382 (starting at 0) 383 - A string, a comma separated list of Excel column letters and column 384 ranges (e.g. `"A:E"` or `"A,C,E:F"`, which would result in 385 `A,B,C,D,E` and `A,C,E,F`). Also supports open-ended ranges 386 (e.g. `"B:"` to select all columns from B onwards) and from-beginning 387 ranges (e.g. `":C"` to select columns from A to C). These can be 388 combined for "except" patterns (e.g. `":C,E:"` to select everything 389 except column D) 390 - A callable, a function that takes a column and returns a boolean 391 indicating whether the column should be used 392 :param dtypes: An optional dtype (for all columns) 393 or dict of dtypes with keys as column indices or names. 394 :param eager: Specifies whether the sheet should be loaded eagerly. 395 `False` (default) will load the sheet lazily using the `PyCapsule` interface, 396 whereas `True` will load it eagerly via `pyarrow`. 397 398 Eager loading requires the `pyarrow` extra to be installed. 399 """ 400 sheet_or_rb = self._reader.load_sheet( 401 idx_or_name=idx_or_name, 402 header_row=header_row, 403 column_names=column_names, 404 skip_rows=skip_rows, 405 n_rows=n_rows, 406 schema_sample_rows=schema_sample_rows, 407 dtype_coercion=dtype_coercion, 408 use_columns=use_columns, 409 dtypes=dtypes, 410 eager=eager, 411 ) 412 return sheet_or_rb if eager else ExcelSheet(sheet_or_rb)
Loads a sheet by index or name.
Parameters
- idx_or_name: The index (starting at 0) or the name of the sheet to load.
- header_row: The index of the row containing the column labels, default index is 0.
If
None
, the sheet does not have any column labels. Any rows before theheader_row
will be automatically skipped. - column_names: Overrides headers found in the document.
If
column_names
is used,header_row
will be ignored. - n_rows: Specifies how many rows should be loaded.
If
None
, all rows are loaded - skip_rows: Specifies which rows should be skipped after the
header_row
. Any rows before theheader_row
are automatically skipped. It means row indices are relative to data rows, not the sheet! Can be one of:int
: Skip this many rows after the header rowlist[int]
: Skip specific row indices (0-based relative to data rows)Callable[[int], bool]
: Function that receives row index (0-based relative to data rows) and returns True to skip the rowNone
: Ifheader_row
is None, skips empty rows at beginning
- schema_sample_rows: Specifies how many rows should be used to determine
the dtype of a column. Cannot be 0. A specific dtype can be
enforced for some or all columns through the
dtypes
parameter. IfNone
, all rows will be used. - dtype_coercion: Specifies how type coercion should behave.
coerce
(the default) will try to coerce different dtypes in a column to the same one, whereasstrict
will raise an error in case a column contains several dtypes. Note that this only applies to columns whose dtype is guessed, i.e. not specified viadtypes
. - use_columns: Specifies the columns to use. Can either be:
None
to select all columns- A list of strings and ints, the column names and/or indices (starting at 0)
- A string, a comma separated list of Excel column letters and column
ranges (e.g.
"A:E"
or"A,C,E:F"
, which would result inA,B,C,D,E
andA,C,E,F
). Also supports open-ended ranges (e.g."B:"
to select all columns from B onwards) and from-beginning ranges (e.g.":C"
to select columns from A to C). These can be combined for "except" patterns (e.g.":C,E:"
to select everything except column D) - A callable, a function that takes a column and returns a boolean indicating whether the column should be used
- dtypes: An optional dtype (for all columns) or dict of dtypes with keys as column indices or names.
eager: Specifies whether the sheet should be loaded eagerly.
False
(default) will load the sheet lazily using thePyCapsule
interface, whereasTrue
will load it eagerly viapyarrow
.Eager loading requires the `pyarrow` extra to be installed.
414 def table_names(self, sheet_name: str | None = None) -> list[str]: 415 """The list of table names. 416 417 Will return an empty list if no tables are found. 418 419 :param sheet_name: If given, will limit the list to the given sheet, will be faster 420 too. 421 """ 422 return self._reader.table_names(sheet_name)
The list of table names.
Will return an empty list if no tables are found.
Parameters
- sheet_name: If given, will limit the list to the given sheet, will be faster too.
464 def load_table( 465 self, 466 name: str, 467 *, 468 header_row: int | None = None, 469 column_names: list[str] | None = None, 470 skip_rows: int | None = None, 471 n_rows: int | None = None, 472 schema_sample_rows: int | None = 1_000, 473 dtype_coercion: Literal["coerce", "strict"] = "coerce", 474 use_columns: list[str] 475 | list[int] 476 | str 477 | Callable[[ColumnInfoNoDtype], bool] 478 | None = None, 479 dtypes: DType | DTypeMap | None = None, 480 eager: bool = False, 481 ) -> "ExcelTable | pa.RecordBatch": 482 """Loads a table by name. 483 484 :param name: The name of the table to load. 485 :param header_row: The index of the row containing the column labels. 486 If `None`, the table's column names will be used. 487 Any rows before the `header_row` will be automatically skipped. 488 :param column_names: Overrides headers found in the document. 489 If `column_names` is used, `header_row` will be ignored. 490 :param n_rows: Specifies how many rows should be loaded. 491 If `None`, all rows are loaded 492 :param skip_rows: Specifies how many rows should be skipped after the `header_row`. 493 Any rows before the `header_row` are automatically skipped. 494 If `header_row` is `None`, it skips the number of rows from the 495 start of the sheet. 496 :param schema_sample_rows: Specifies how many rows should be used to determine 497 the dtype of a column. Cannot be 0. A specific dtype can be 498 enforced for some or all columns through the `dtypes` parameter. 499 If `None`, all rows will be used. 500 :param dtype_coercion: Specifies how type coercion should behave. `coerce` (the default) 501 will try to coerce different dtypes in a column to the same one, 502 whereas `strict` will raise an error in case a column contains 503 several dtypes. Note that this only applies to columns whose dtype 504 is guessed, i.e. not specified via `dtypes`. 505 :param use_columns: Specifies the columns to use. Can either be: 506 - `None` to select all columns 507 - A list of strings and ints, the column names and/or indices 508 (starting at 0) 509 - A string, a comma separated list of Excel column letters and column 510 ranges (e.g. `"A:E"` or `"A,C,E:F"`, which would result in 511 `A,B,C,D,E` and `A,C,E,F`). Also supports open-ended ranges 512 (e.g. `"B:"` to select all columns from B onwards) and from-beginning 513 ranges (e.g. `":C"` to select columns from A to C). These can be 514 combined for "except" patterns (e.g. `":C,E:"` to select everything 515 except column D) 516 - A callable, a function that takes a column and returns a boolean 517 indicating whether the column should be used 518 :param dtypes: An optional dtype (for all columns) 519 or dict of dtypes with keys as column indices or names. 520 :param eager: Specifies whether the table should be loaded eagerly. 521 `False` (default) will load the table lazily using the `PyCapsule` interface, 522 whereas `True` will load it eagerly via `pyarrow`. 523 524 Eager loading requires the `pyarrow` extra to be installed. 525 """ 526 if eager: 527 return self._reader.load_table( 528 name=name, 529 header_row=header_row, 530 column_names=column_names, 531 skip_rows=skip_rows, 532 n_rows=n_rows, 533 schema_sample_rows=schema_sample_rows, 534 dtype_coercion=dtype_coercion, 535 use_columns=use_columns, 536 dtypes=dtypes, 537 eager=True, 538 ) 539 else: 540 return ExcelTable( 541 self._reader.load_table( 542 name=name, 543 header_row=header_row, 544 column_names=column_names, 545 skip_rows=skip_rows, 546 n_rows=n_rows, 547 schema_sample_rows=schema_sample_rows, 548 dtype_coercion=dtype_coercion, 549 use_columns=use_columns, 550 dtypes=dtypes, 551 eager=False, 552 ) 553 )
Loads a table by name.
Parameters
- name: The name of the table to load.
- header_row: The index of the row containing the column labels.
If
None
, the table's column names will be used. Any rows before theheader_row
will be automatically skipped. - column_names: Overrides headers found in the document.
If
column_names
is used,header_row
will be ignored. - n_rows: Specifies how many rows should be loaded.
If
None
, all rows are loaded - skip_rows: Specifies how many rows should be skipped after the
header_row
. Any rows before theheader_row
are automatically skipped. Ifheader_row
isNone
, it skips the number of rows from the start of the sheet. - schema_sample_rows: Specifies how many rows should be used to determine
the dtype of a column. Cannot be 0. A specific dtype can be
enforced for some or all columns through the
dtypes
parameter. IfNone
, all rows will be used. - dtype_coercion: Specifies how type coercion should behave.
coerce
(the default) will try to coerce different dtypes in a column to the same one, whereasstrict
will raise an error in case a column contains several dtypes. Note that this only applies to columns whose dtype is guessed, i.e. not specified viadtypes
. - use_columns: Specifies the columns to use. Can either be:
None
to select all columns- A list of strings and ints, the column names and/or indices (starting at 0)
- A string, a comma separated list of Excel column letters and column
ranges (e.g.
"A:E"
or"A,C,E:F"
, which would result inA,B,C,D,E
andA,C,E,F
). Also supports open-ended ranges (e.g."B:"
to select all columns from B onwards) and from-beginning ranges (e.g.":C"
to select columns from A to C). These can be combined for "except" patterns (e.g.":C,E:"
to select everything except column D) - A callable, a function that takes a column and returns a boolean indicating whether the column should be used
- dtypes: An optional dtype (for all columns) or dict of dtypes with keys as column indices or names.
eager: Specifies whether the table should be loaded eagerly.
False
(default) will load the table lazily using thePyCapsule
interface, whereasTrue
will load it eagerly viapyarrow
.Eager loading requires the `pyarrow` extra to be installed.
555 def load_sheet_eager( 556 self, 557 idx_or_name: int | str, 558 *, 559 header_row: int | None = 0, 560 column_names: list[str] | None = None, 561 skip_rows: int | list[int] | Callable[[int], bool] | None = None, 562 n_rows: int | None = None, 563 schema_sample_rows: int | None = 1_000, 564 dtype_coercion: Literal["coerce", "strict"] = "coerce", 565 use_columns: list[str] | list[int] | str | None = None, 566 dtypes: DType | DTypeMap | None = None, 567 ) -> "pa.RecordBatch": 568 """Loads a sheet eagerly by index or name. 569 570 For xlsx files, this will be faster and more memory-efficient, as it will use 571 `worksheet_range_ref` under the hood, which returns borrowed types. 572 573 Refer to `load_sheet` for parameter documentation 574 575 Requires the `pyarrow` extra to be installed. 576 """ 577 return self._reader.load_sheet( 578 idx_or_name=idx_or_name, 579 header_row=header_row, 580 column_names=column_names, 581 skip_rows=skip_rows, 582 n_rows=n_rows, 583 schema_sample_rows=schema_sample_rows, 584 dtype_coercion=dtype_coercion, 585 use_columns=use_columns, 586 dtypes=dtypes, 587 eager=True, 588 )
Loads a sheet eagerly by index or name.
For xlsx files, this will be faster and more memory-efficient, as it will use
worksheet_range_ref
under the hood, which returns borrowed types.
Refer to load_sheet
for parameter documentation
Requires the pyarrow
extra to be installed.
590 def load_sheet_by_name( 591 self, 592 name: str, 593 *, 594 header_row: int | None = 0, 595 column_names: list[str] | None = None, 596 skip_rows: int | None = None, 597 n_rows: int | None = None, 598 schema_sample_rows: int | None = 1_000, 599 dtype_coercion: Literal["coerce", "strict"] = "coerce", 600 use_columns: list[str] 601 | list[int] 602 | str 603 | Callable[[ColumnInfoNoDtype], bool] 604 | None = None, 605 dtypes: DType | DTypeMap | None = None, 606 ) -> ExcelSheet: 607 """Loads a sheet by name. 608 609 Refer to `load_sheet` for parameter documentation 610 """ 611 return self.load_sheet( 612 name, 613 header_row=header_row, 614 column_names=column_names, 615 skip_rows=skip_rows, 616 n_rows=n_rows, 617 schema_sample_rows=schema_sample_rows, 618 dtype_coercion=dtype_coercion, 619 use_columns=use_columns, 620 dtypes=dtypes, 621 )
Loads a sheet by name.
Refer to load_sheet
for parameter documentation
623 def load_sheet_by_idx( 624 self, 625 idx: int, 626 *, 627 header_row: int | None = 0, 628 column_names: list[str] | None = None, 629 skip_rows: int | None = None, 630 n_rows: int | None = None, 631 schema_sample_rows: int | None = 1_000, 632 dtype_coercion: Literal["coerce", "strict"] = "coerce", 633 use_columns: list[str] 634 | list[int] 635 | str 636 | Callable[[ColumnInfoNoDtype], bool] 637 | None = None, 638 dtypes: DType | DTypeMap | None = None, 639 ) -> ExcelSheet: 640 """Loads a sheet by index. 641 642 Refer to `load_sheet` for parameter documentation 643 """ 644 return self.load_sheet( 645 idx, 646 header_row=header_row, 647 column_names=column_names, 648 skip_rows=skip_rows, 649 n_rows=n_rows, 650 schema_sample_rows=schema_sample_rows, 651 dtype_coercion=dtype_coercion, 652 use_columns=use_columns, 653 dtypes=dtypes, 654 )
Loads a sheet by index.
Refer to load_sheet
for parameter documentation
59class ExcelSheet: 60 """A class representing a single sheet in an Excel File""" 61 62 def __init__(self, sheet: _ExcelSheet) -> None: 63 self._sheet = sheet 64 65 @property 66 def name(self) -> str: 67 """The name of the sheet""" 68 return self._sheet.name 69 70 @property 71 def width(self) -> int: 72 """The sheet's width""" 73 return self._sheet.width 74 75 @property 76 def height(self) -> int: 77 """The sheet's height, with `skip_rows` and `nrows` applied""" 78 return self._sheet.height 79 80 @property 81 def total_height(self) -> int: 82 """The sheet's total height""" 83 return self._sheet.total_height 84 85 @property 86 def selected_columns(self) -> list[ColumnInfo]: 87 """The sheet's selected columns""" 88 return self._sheet.selected_columns 89 90 def available_columns(self) -> list[ColumnInfo]: 91 """The columns available for the given sheet""" 92 return self._sheet.available_columns() 93 94 @property 95 def specified_dtypes(self) -> DTypeMap | None: 96 """The dtypes specified for the sheet""" 97 return self._sheet.specified_dtypes 98 99 @property 100 def visible(self) -> SheetVisible: 101 """The visibility of the sheet""" 102 return self._sheet.visible 103 104 def to_arrow(self) -> "pa.RecordBatch": 105 """Converts the sheet to a pyarrow `RecordBatch` 106 107 Requires the `pyarrow` extra to be installed. 108 """ 109 if not _PYARROW_AVAILABLE: 110 raise ImportError( 111 "pyarrow is required for to_arrow(). Install with: pip install 'fastexcel[pyarrow]'" 112 ) 113 return self._sheet.to_arrow() 114 115 def to_arrow_with_errors(self) -> "tuple[pa.RecordBatch, CellErrors | None]": 116 """Converts the sheet to a pyarrow `RecordBatch` with error information. 117 118 Stores the positions of any values that cannot be parsed as the specified type and were 119 therefore converted to None. 120 121 Requires the `pyarrow` extra to be installed. 122 """ 123 if not _PYARROW_AVAILABLE: 124 raise ImportError( 125 "pyarrow is required for to_arrow_with_errors(). Install with: pip install 'fastexcel[pyarrow]'" # noqa: E501 126 ) 127 rb, cell_errors = self._sheet.to_arrow_with_errors() 128 if not cell_errors.errors: 129 return (rb, None) 130 return (rb, cell_errors) 131 132 def to_pandas(self) -> "pd.DataFrame": 133 """Converts the sheet to a Pandas `DataFrame`. 134 135 Requires the `pandas` extra to be installed. 136 """ 137 # Note: pandas PyCapsule interface requires __dataframe__ or __arrow_c_stream__ 138 # which we don't implement. Using pyarrow conversion for now. 139 # (see https://pandas.pydata.org/docs/reference/api/pandas.api.interchange.from_dataframe.html) 140 return self.to_arrow().to_pandas() 141 142 def to_polars(self) -> "pl.DataFrame": 143 """Converts the sheet to a Polars `DataFrame`. 144 145 Uses the Arrow PyCapsule Interface for zero-copy data exchange. 146 Requires the `polars` extra to be installed. 147 """ 148 import polars as pl 149 150 return pl.DataFrame(self) 151 152 def __arrow_c_schema__(self) -> object: 153 """Export the schema as an `ArrowSchema` `PyCapsule`. 154 155 https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowschema-export 156 157 The Arrow PyCapsule Interface enables zero-copy data exchange with 158 Arrow-compatible libraries without requiring PyArrow as a dependency. 159 """ 160 return self._sheet.__arrow_c_schema__() 161 162 def __arrow_c_array__(self, requested_schema: object | None = None) -> tuple[object, object]: 163 """Export the schema and data as a pair of `ArrowSchema` and `ArrowArray` `PyCapsules`. 164 165 The optional `requested_schema` parameter allows for potential schema conversion. 166 167 https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowarray-export 168 169 The Arrow PyCapsule Interface enables zero-copy data exchange with 170 Arrow-compatible libraries without requiring PyArrow as a dependency. 171 """ 172 return self._sheet.__arrow_c_array__(requested_schema) 173 174 def __repr__(self) -> str: 175 return self._sheet.__repr__()
A class representing a single sheet in an Excel File
75 @property 76 def height(self) -> int: 77 """The sheet's height, with `skip_rows` and `nrows` applied""" 78 return self._sheet.height
The sheet's height, with skip_rows
and nrows
applied
80 @property 81 def total_height(self) -> int: 82 """The sheet's total height""" 83 return self._sheet.total_height
The sheet's total height
85 @property 86 def selected_columns(self) -> list[ColumnInfo]: 87 """The sheet's selected columns""" 88 return self._sheet.selected_columns
The sheet's selected columns
90 def available_columns(self) -> list[ColumnInfo]: 91 """The columns available for the given sheet""" 92 return self._sheet.available_columns()
The columns available for the given sheet
94 @property 95 def specified_dtypes(self) -> DTypeMap | None: 96 """The dtypes specified for the sheet""" 97 return self._sheet.specified_dtypes
The dtypes specified for the sheet
99 @property 100 def visible(self) -> SheetVisible: 101 """The visibility of the sheet""" 102 return self._sheet.visible
The visibility of the sheet
104 def to_arrow(self) -> "pa.RecordBatch": 105 """Converts the sheet to a pyarrow `RecordBatch` 106 107 Requires the `pyarrow` extra to be installed. 108 """ 109 if not _PYARROW_AVAILABLE: 110 raise ImportError( 111 "pyarrow is required for to_arrow(). Install with: pip install 'fastexcel[pyarrow]'" 112 ) 113 return self._sheet.to_arrow()
Converts the sheet to a pyarrow RecordBatch
Requires the pyarrow
extra to be installed.
115 def to_arrow_with_errors(self) -> "tuple[pa.RecordBatch, CellErrors | None]": 116 """Converts the sheet to a pyarrow `RecordBatch` with error information. 117 118 Stores the positions of any values that cannot be parsed as the specified type and were 119 therefore converted to None. 120 121 Requires the `pyarrow` extra to be installed. 122 """ 123 if not _PYARROW_AVAILABLE: 124 raise ImportError( 125 "pyarrow is required for to_arrow_with_errors(). Install with: pip install 'fastexcel[pyarrow]'" # noqa: E501 126 ) 127 rb, cell_errors = self._sheet.to_arrow_with_errors() 128 if not cell_errors.errors: 129 return (rb, None) 130 return (rb, cell_errors)
Converts the sheet to a pyarrow RecordBatch
with error information.
Stores the positions of any values that cannot be parsed as the specified type and were therefore converted to None.
Requires the pyarrow
extra to be installed.
132 def to_pandas(self) -> "pd.DataFrame": 133 """Converts the sheet to a Pandas `DataFrame`. 134 135 Requires the `pandas` extra to be installed. 136 """ 137 # Note: pandas PyCapsule interface requires __dataframe__ or __arrow_c_stream__ 138 # which we don't implement. Using pyarrow conversion for now. 139 # (see https://pandas.pydata.org/docs/reference/api/pandas.api.interchange.from_dataframe.html) 140 return self.to_arrow().to_pandas()
Converts the sheet to a Pandas DataFrame
.
Requires the pandas
extra to be installed.
142 def to_polars(self) -> "pl.DataFrame": 143 """Converts the sheet to a Polars `DataFrame`. 144 145 Uses the Arrow PyCapsule Interface for zero-copy data exchange. 146 Requires the `polars` extra to be installed. 147 """ 148 import polars as pl 149 150 return pl.DataFrame(self)
Converts the sheet to a Polars DataFrame
.
Uses the Arrow PyCapsule Interface for zero-copy data exchange.
Requires the polars
extra to be installed.
178class ExcelTable: 179 """A class representing a single table in an Excel file""" 180 181 def __init__(self, table: _ExcelTable) -> None: 182 self._table = table 183 184 @property 185 def name(self) -> str: 186 """The name of the table""" 187 return self._table.name 188 189 @property 190 def sheet_name(self) -> str: 191 """The name of the sheet this table belongs to""" 192 return self._table.sheet_name 193 194 @property 195 def width(self) -> int: 196 """The table's width""" 197 return self._table.width 198 199 @property 200 def height(self) -> int: 201 """The table's height""" 202 return self._table.height 203 204 @property 205 def total_height(self) -> int: 206 """The table's total height""" 207 return self._table.total_height 208 209 @property 210 def offset(self) -> int: 211 """The table's offset before data starts""" 212 return self._table.offset 213 214 @property 215 def selected_columns(self) -> list[ColumnInfo]: 216 """The table's selected columns""" 217 return self._table.selected_columns 218 219 def available_columns(self) -> list[ColumnInfo]: 220 """The columns available for the given table""" 221 return self._table.available_columns() 222 223 @property 224 def specified_dtypes(self) -> DTypeMap | None: 225 """The dtypes specified for the table""" 226 return self._table.specified_dtypes 227 228 def to_arrow(self) -> "pa.RecordBatch": 229 """Converts the table to a pyarrow `RecordBatch` 230 231 Requires the `pyarrow` extra to be installed. 232 """ 233 if not _PYARROW_AVAILABLE: 234 raise ImportError( 235 "pyarrow is required for to_arrow(). Install with: pip install 'fastexcel[pyarrow]'" 236 ) 237 return self._table.to_arrow() 238 239 def to_pandas(self) -> "pd.DataFrame": 240 """Converts the table to a Pandas `DataFrame`. 241 242 Requires the `pandas` extra to be installed. 243 """ 244 # Note: pandas PyCapsule interface requires __dataframe__ or __arrow_c_stream__ 245 # which we don't implement. Using pyarrow conversion for now. 246 # (see https://pandas.pydata.org/docs/reference/api/pandas.api.interchange.from_dataframe.html) 247 return self.to_arrow().to_pandas() 248 249 def to_polars(self) -> "pl.DataFrame": 250 """Converts the table to a Polars `DataFrame`. 251 252 Uses the Arrow PyCapsule Interface for zero-copy data exchange. 253 Requires the `polars` extra to be installed. 254 """ 255 import polars as pl 256 257 return pl.DataFrame(self) 258 259 def __arrow_c_schema__(self) -> object: 260 """Export the schema as an `ArrowSchema` `PyCapsule`. 261 262 https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowschema-export 263 264 The Arrow PyCapsule Interface enables zero-copy data exchange with 265 Arrow-compatible libraries without requiring PyArrow as a dependency. 266 """ 267 return self._table.__arrow_c_schema__() 268 269 def __arrow_c_array__(self, requested_schema: object | None = None) -> tuple[object, object]: 270 """Export the schema and data as a pair of `ArrowSchema` and `ArrowArray` `PyCapsules`. 271 272 The optional `requested_schema` parameter allows for potential schema conversion. 273 274 https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowarray-export 275 276 The Arrow PyCapsule Interface enables zero-copy data exchange with 277 Arrow-compatible libraries without requiring PyArrow as a dependency. 278 """ 279 return self._table.__arrow_c_array__(requested_schema)
A class representing a single table in an Excel file
184 @property 185 def name(self) -> str: 186 """The name of the table""" 187 return self._table.name
The name of the table
189 @property 190 def sheet_name(self) -> str: 191 """The name of the sheet this table belongs to""" 192 return self._table.sheet_name
The name of the sheet this table belongs to
199 @property 200 def height(self) -> int: 201 """The table's height""" 202 return self._table.height
The table's height
204 @property 205 def total_height(self) -> int: 206 """The table's total height""" 207 return self._table.total_height
The table's total height
209 @property 210 def offset(self) -> int: 211 """The table's offset before data starts""" 212 return self._table.offset
The table's offset before data starts
214 @property 215 def selected_columns(self) -> list[ColumnInfo]: 216 """The table's selected columns""" 217 return self._table.selected_columns
The table's selected columns
219 def available_columns(self) -> list[ColumnInfo]: 220 """The columns available for the given table""" 221 return self._table.available_columns()
The columns available for the given table
223 @property 224 def specified_dtypes(self) -> DTypeMap | None: 225 """The dtypes specified for the table""" 226 return self._table.specified_dtypes
The dtypes specified for the table
228 def to_arrow(self) -> "pa.RecordBatch": 229 """Converts the table to a pyarrow `RecordBatch` 230 231 Requires the `pyarrow` extra to be installed. 232 """ 233 if not _PYARROW_AVAILABLE: 234 raise ImportError( 235 "pyarrow is required for to_arrow(). Install with: pip install 'fastexcel[pyarrow]'" 236 ) 237 return self._table.to_arrow()
Converts the table to a pyarrow RecordBatch
Requires the pyarrow
extra to be installed.
239 def to_pandas(self) -> "pd.DataFrame": 240 """Converts the table to a Pandas `DataFrame`. 241 242 Requires the `pandas` extra to be installed. 243 """ 244 # Note: pandas PyCapsule interface requires __dataframe__ or __arrow_c_stream__ 245 # which we don't implement. Using pyarrow conversion for now. 246 # (see https://pandas.pydata.org/docs/reference/api/pandas.api.interchange.from_dataframe.html) 247 return self.to_arrow().to_pandas()
Converts the table to a Pandas DataFrame
.
Requires the pandas
extra to be installed.
249 def to_polars(self) -> "pl.DataFrame": 250 """Converts the table to a Polars `DataFrame`. 251 252 Uses the Arrow PyCapsule Interface for zero-copy data exchange. 253 Requires the `polars` extra to be installed. 254 """ 255 import polars as pl 256 257 return pl.DataFrame(self)
Converts the table to a Polars DataFrame
.
Uses the Arrow PyCapsule Interface for zero-copy data exchange.
Requires the polars
extra to be installed.
Metadata about a single column in a sheet.
fastexcel.ColumnNameFrom
. How the name of the column was determined.
One of three possible values:
"provided"
: The column name was provided via theuse_columns
parameter"looked_up"
: The column name was looked up from the data found in the sheet"generated"
: The column name was generated from the column index, either becauseheader_row
wasNone
, or because it could not be looked up
fastexcel.DTypeFrom
. How the dtype of the column was determined.
One of three possible values:
"provided_by_index"
: The dtype was specified via the column index"provided_by_name"
: The dtype was specified via the column name"guessed"
: The dtype was determined from the content of the column
The base class for all fastexcel errors
Data for a given cell cannot be retrieved
calamine returned an error regarding the content of the cell
Generic calamine error
Sheet was not found
Column was not found
Generic arrow error
Provided parameters are invalid
Column contains an unsupported type combination