fastexcel
1from __future__ import annotations 2 3import sys 4import typing 5from typing import TYPE_CHECKING, Callable, Literal 6 7if sys.version_info < (3, 10): 8 from typing_extensions import TypeAlias 9else: 10 from typing import TypeAlias 11 12if TYPE_CHECKING: 13 import pandas as pd 14 import polars as pl 15 import pyarrow as pa 16 17from os.path import expanduser 18from pathlib import Path 19 20try: 21 import importlib.util 22 23 importlib.util.find_spec("pyarrow") 24 _PYARROW_AVAILABLE = True 25except ImportError: 26 _PYARROW_AVAILABLE = False 27 28from ._fastexcel import ( 29 ArrowError, 30 CalamineCellError, 31 CalamineError, 32 CannotRetrieveCellDataError, 33 CellError, 34 CellErrors, 35 ColumnInfo, 36 ColumnInfoNoDtype, 37 ColumnNotFoundError, 38 DefinedName, 39 FastExcelError, 40 InvalidParametersError, 41 SheetNotFoundError, 42 UnsupportedColumnTypeCombinationError, 43 __version__, 44 _ExcelReader, 45 _ExcelSheet, 46 _ExcelTable, 47) 48from ._fastexcel import read_excel as _read_excel 49 50DType = Literal["null", "int", "float", "string", "boolean", "datetime", "date", "duration"] 51DTypeMap: TypeAlias = "dict[str | int, DType]" 52ColumnNameFrom: TypeAlias = Literal["provided", "looked_up", "generated"] 53DTypeFrom: TypeAlias = Literal[ 54 "provided_for_all", "provided_by_index", "provided_by_name", "guessed" 55] 56SheetVisible: TypeAlias = Literal["visible", "hidden", "veryhidden"] 57 58 59class ExcelSheet: 60 """A class representing a single sheet in an Excel File""" 61 62 def __init__(self, sheet: _ExcelSheet) -> None: 63 self._sheet = sheet 64 65 @property 66 def name(self) -> str: 67 """The name of the sheet""" 68 return self._sheet.name 69 70 @property 71 def width(self) -> int: 72 """The sheet's width""" 73 return self._sheet.width 74 75 @property 76 def height(self) -> int: 77 """The sheet's height, with `skip_rows` and `nrows` applied""" 78 return self._sheet.height 79 80 @property 81 def total_height(self) -> int: 82 """The sheet's total height""" 83 return self._sheet.total_height 84 85 @property 86 def selected_columns(self) -> list[ColumnInfo]: 87 """The sheet's selected columns""" 88 return self._sheet.selected_columns 89 90 def available_columns(self) -> list[ColumnInfo]: 91 """The columns available for the given sheet""" 92 return self._sheet.available_columns() 93 94 @property 95 def specified_dtypes(self) -> DTypeMap | None: 96 """The dtypes specified for the sheet""" 97 return self._sheet.specified_dtypes 98 99 @property 100 def visible(self) -> SheetVisible: 101 """The visibility of the sheet""" 102 return self._sheet.visible 103 104 def to_arrow(self) -> "pa.RecordBatch": 105 """Converts the sheet to a pyarrow `RecordBatch` 106 107 Requires the `pyarrow` extra to be installed. 108 """ 109 if not _PYARROW_AVAILABLE: 110 raise ImportError( 111 "pyarrow is required for to_arrow(). Install with: pip install 'fastexcel[pyarrow]'" 112 ) 113 return self._sheet.to_arrow() 114 115 def to_arrow_with_errors(self) -> "tuple[pa.RecordBatch, CellErrors | None]": 116 """Converts the sheet to a pyarrow `RecordBatch` with error information. 117 118 Stores the positions of any values that cannot be parsed as the specified type and were 119 therefore converted to None. 120 121 Requires the `pyarrow` extra to be installed. 122 """ 123 if not _PYARROW_AVAILABLE: 124 raise ImportError( 125 "pyarrow is required for to_arrow_with_errors(). Install with: pip install 'fastexcel[pyarrow]'" # noqa: E501 126 ) 127 rb, cell_errors = self._sheet.to_arrow_with_errors() 128 if not cell_errors.errors: 129 return (rb, None) 130 return (rb, cell_errors) 131 132 def to_pandas(self) -> "pd.DataFrame": 133 """Converts the sheet to a Pandas `DataFrame`. 134 135 Requires the `pandas` extra to be installed. 136 """ 137 # Note: pandas PyCapsule interface requires __dataframe__ or __arrow_c_stream__ 138 # which we don't implement. Using pyarrow conversion for now. 139 # (see https://pandas.pydata.org/docs/reference/api/pandas.api.interchange.from_dataframe.html) 140 return self.to_arrow().to_pandas() 141 142 def to_polars(self) -> "pl.DataFrame": 143 """Converts the sheet to a Polars `DataFrame`. 144 145 Uses the Arrow PyCapsule Interface for zero-copy data exchange. 146 Requires the `polars` extra to be installed. 147 """ 148 import polars as pl 149 150 return pl.DataFrame(self) 151 152 def __arrow_c_schema__(self) -> object: 153 """Export the schema as an `ArrowSchema` `PyCapsule`. 154 155 https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowschema-export 156 157 The Arrow PyCapsule Interface enables zero-copy data exchange with 158 Arrow-compatible libraries without requiring PyArrow as a dependency. 159 """ 160 return self._sheet.__arrow_c_schema__() 161 162 def __arrow_c_array__(self, requested_schema: object | None = None) -> tuple[object, object]: 163 """Export the schema and data as a pair of `ArrowSchema` and `ArrowArray` `PyCapsules`. 164 165 The optional `requested_schema` parameter allows for potential schema conversion. 166 167 https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowarray-export 168 169 The Arrow PyCapsule Interface enables zero-copy data exchange with 170 Arrow-compatible libraries without requiring PyArrow as a dependency. 171 """ 172 return self._sheet.__arrow_c_array__(requested_schema) 173 174 def __repr__(self) -> str: 175 return self._sheet.__repr__() 176 177 178class ExcelTable: 179 """A class representing a single table in an Excel file""" 180 181 def __init__(self, table: _ExcelTable) -> None: 182 self._table = table 183 184 @property 185 def name(self) -> str: 186 """The name of the table""" 187 return self._table.name 188 189 @property 190 def sheet_name(self) -> str: 191 """The name of the sheet this table belongs to""" 192 return self._table.sheet_name 193 194 @property 195 def width(self) -> int: 196 """The table's width""" 197 return self._table.width 198 199 @property 200 def height(self) -> int: 201 """The table's height""" 202 return self._table.height 203 204 @property 205 def total_height(self) -> int: 206 """The table's total height""" 207 return self._table.total_height 208 209 @property 210 def offset(self) -> int: 211 """The table's offset before data starts""" 212 return self._table.offset 213 214 @property 215 def selected_columns(self) -> list[ColumnInfo]: 216 """The table's selected columns""" 217 return self._table.selected_columns 218 219 def available_columns(self) -> list[ColumnInfo]: 220 """The columns available for the given table""" 221 return self._table.available_columns() 222 223 @property 224 def specified_dtypes(self) -> DTypeMap | None: 225 """The dtypes specified for the table""" 226 return self._table.specified_dtypes 227 228 def to_arrow(self) -> "pa.RecordBatch": 229 """Converts the table to a pyarrow `RecordBatch` 230 231 Requires the `pyarrow` extra to be installed. 232 """ 233 if not _PYARROW_AVAILABLE: 234 raise ImportError( 235 "pyarrow is required for to_arrow(). Install with: pip install 'fastexcel[pyarrow]'" 236 ) 237 return self._table.to_arrow() 238 239 def to_pandas(self) -> "pd.DataFrame": 240 """Converts the table to a Pandas `DataFrame`. 241 242 Requires the `pandas` extra to be installed. 243 """ 244 # Note: pandas PyCapsule interface requires __dataframe__ or __arrow_c_stream__ 245 # which we don't implement. Using pyarrow conversion for now. 246 # (see https://pandas.pydata.org/docs/reference/api/pandas.api.interchange.from_dataframe.html) 247 return self.to_arrow().to_pandas() 248 249 def to_polars(self) -> "pl.DataFrame": 250 """Converts the table to a Polars `DataFrame`. 251 252 Uses the Arrow PyCapsule Interface for zero-copy data exchange. 253 Requires the `polars` extra to be installed. 254 """ 255 import polars as pl 256 257 return pl.DataFrame(self) 258 259 def __arrow_c_schema__(self) -> object: 260 """Export the schema as an `ArrowSchema` `PyCapsule`. 261 262 https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowschema-export 263 264 The Arrow PyCapsule Interface enables zero-copy data exchange with 265 Arrow-compatible libraries without requiring PyArrow as a dependency. 266 """ 267 return self._table.__arrow_c_schema__() 268 269 def __arrow_c_array__(self, requested_schema: object | None = None) -> tuple[object, object]: 270 """Export the schema and data as a pair of `ArrowSchema` and `ArrowArray` `PyCapsules`. 271 272 The optional `requested_schema` parameter allows for potential schema conversion. 273 274 https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowarray-export 275 276 The Arrow PyCapsule Interface enables zero-copy data exchange with 277 Arrow-compatible libraries without requiring PyArrow as a dependency. 278 """ 279 return self._table.__arrow_c_array__(requested_schema) 280 281 282class ExcelReader: 283 """A class representing an open Excel file and allowing to read its sheets""" 284 285 def __init__(self, reader: _ExcelReader) -> None: 286 self._reader = reader 287 288 @property 289 def sheet_names(self) -> list[str]: 290 """The list of sheet names""" 291 return self._reader.sheet_names 292 293 @typing.overload 294 def load_sheet( 295 self, 296 idx_or_name: int | str, 297 *, 298 header_row: int | None = 0, 299 column_names: list[str] | None = None, 300 skip_rows: int | list[int] | Callable[[int], bool] | None = None, 301 n_rows: int | None = None, 302 schema_sample_rows: int | None = 1_000, 303 dtype_coercion: Literal["coerce", "strict"] = "coerce", 304 use_columns: list[str] 305 | list[int] 306 | str 307 | Callable[[ColumnInfoNoDtype], bool] 308 | None = None, 309 dtypes: DType | DTypeMap | None = None, 310 eager: Literal[False] = ..., 311 ) -> ExcelSheet: ... 312 313 @typing.overload 314 def load_sheet( 315 self, 316 idx_or_name: int | str, 317 *, 318 header_row: int | None = 0, 319 column_names: list[str] | None = None, 320 skip_rows: int | list[int] | Callable[[int], bool] | None = None, 321 n_rows: int | None = None, 322 schema_sample_rows: int | None = 1_000, 323 dtype_coercion: Literal["coerce", "strict"] = "coerce", 324 use_columns: list[str] 325 | list[int] 326 | str 327 | Callable[[ColumnInfoNoDtype], bool] 328 | None = None, 329 dtypes: DType | DTypeMap | None = None, 330 eager: Literal[True] = ..., 331 ) -> "pa.RecordBatch": ... 332 333 def load_sheet( 334 self, 335 idx_or_name: int | str, 336 *, 337 header_row: int | None = 0, 338 column_names: list[str] | None = None, 339 skip_rows: int | list[int] | Callable[[int], bool] | None = None, 340 n_rows: int | None = None, 341 schema_sample_rows: int | None = 1_000, 342 dtype_coercion: Literal["coerce", "strict"] = "coerce", 343 use_columns: list[str] 344 | list[int] 345 | str 346 | Callable[[ColumnInfoNoDtype], bool] 347 | None = None, 348 dtypes: DType | DTypeMap | None = None, 349 eager: bool = False, 350 ) -> "ExcelSheet | pa.RecordBatch": 351 """Loads a sheet by index or name. 352 353 :param idx_or_name: The index (starting at 0) or the name of the sheet to load. 354 :param header_row: The index of the row containing the column labels, default index is 0. 355 If `None`, the sheet does not have any column labels. 356 Any rows before the `header_row` will be automatically skipped. 357 :param column_names: Overrides headers found in the document. 358 If `column_names` is used, `header_row` will be ignored. 359 :param n_rows: Specifies how many rows should be loaded. 360 If `None`, all rows are loaded 361 :param skip_rows: Specifies which rows should be skipped after the `header_row`. 362 Any rows before the `header_row` are automatically skipped. 363 It means row indices are relative to data rows, not the sheet! 364 Can be one of: 365 - `int`: Skip this many rows after the header row 366 - `list[int]`: Skip specific row indices (0-based relative to data rows) 367 - `Callable[[int], bool]`: Function that receives row index (0-based 368 relative to data rows) and returns True to skip the row 369 - `None`: If `header_row` is None, skips empty rows at beginning 370 :param schema_sample_rows: Specifies how many rows should be used to determine 371 the dtype of a column. Cannot be 0. A specific dtype can be 372 enforced for some or all columns through the `dtypes` parameter. 373 If `None`, all rows will be used. 374 :param dtype_coercion: Specifies how type coercion should behave. `coerce` (the default) 375 will try to coerce different dtypes in a column to the same one, 376 whereas `strict` will raise an error in case a column contains 377 several dtypes. Note that this only applies to columns whose dtype 378 is guessed, i.e. not specified via `dtypes`. 379 :param use_columns: Specifies the columns to use. Can either be: 380 - `None` to select all columns 381 - A list of strings and ints, the column names and/or indices 382 (starting at 0) 383 - A string, a comma separated list of Excel column letters and column 384 ranges (e.g. `"A:E"` or `"A,C,E:F"`, which would result in 385 `A,B,C,D,E` and `A,C,E,F`). Also supports open-ended ranges 386 (e.g. `"B:"` to select all columns from B onwards) and from-beginning 387 ranges (e.g. `":C"` to select columns from A to C). These can be 388 combined for "except" patterns (e.g. `":C,E:"` to select everything 389 except column D) 390 - A callable, a function that takes a column and returns a boolean 391 indicating whether the column should be used 392 :param dtypes: An optional dtype (for all columns) 393 or dict of dtypes with keys as column indices or names. 394 :param eager: Specifies whether the sheet should be loaded eagerly. 395 `False` (default) will load the sheet lazily using the `PyCapsule` interface, 396 whereas `True` will load it eagerly via `pyarrow`. 397 398 Eager loading requires the `pyarrow` extra to be installed. 399 """ 400 sheet_or_rb = self._reader.load_sheet( 401 idx_or_name=idx_or_name, 402 header_row=header_row, 403 column_names=column_names, 404 skip_rows=skip_rows, 405 n_rows=n_rows, 406 schema_sample_rows=schema_sample_rows, 407 dtype_coercion=dtype_coercion, 408 use_columns=use_columns, 409 dtypes=dtypes, 410 eager=eager, 411 ) 412 return sheet_or_rb if eager else ExcelSheet(sheet_or_rb) 413 414 def table_names(self, sheet_name: str | None = None) -> list[str]: 415 """The list of table names. 416 417 Will return an empty list if no tables are found. 418 419 :param sheet_name: If given, will limit the list to the given sheet, will be faster 420 too. 421 """ 422 return self._reader.table_names(sheet_name) 423 424 def defined_names(self) -> list[DefinedName]: 425 """The list of defined names (named ranges) in the workbook. 426 427 Returns a list of DefinedName objects with 'name' and 'formula' attributes. 428 The formula is a string representation of the range or expression. 429 430 Will return an empty list if no defined names are found. 431 """ 432 return self._reader.defined_names() 433 434 @typing.overload 435 def load_table( 436 self, 437 name: str, 438 *, 439 header_row: int | None = None, 440 column_names: list[str] | None = None, 441 skip_rows: int | None = None, 442 n_rows: int | None = None, 443 schema_sample_rows: int | None = 1_000, 444 dtype_coercion: Literal["coerce", "strict"] = "coerce", 445 use_columns: list[str] 446 | list[int] 447 | str 448 | Callable[[ColumnInfoNoDtype], bool] 449 | None = None, 450 dtypes: DType | DTypeMap | None = None, 451 eager: Literal[False] = ..., 452 ) -> ExcelTable: ... 453 454 @typing.overload 455 def load_table( 456 self, 457 name: str, 458 *, 459 header_row: int | None = None, 460 column_names: list[str] | None = None, 461 skip_rows: int | None = None, 462 n_rows: int | None = None, 463 schema_sample_rows: int | None = 1_000, 464 dtype_coercion: Literal["coerce", "strict"] = "coerce", 465 use_columns: list[str] 466 | list[int] 467 | str 468 | Callable[[ColumnInfoNoDtype], bool] 469 | None = None, 470 dtypes: DType | DTypeMap | None = None, 471 eager: Literal[True] = ..., 472 ) -> "pa.RecordBatch": ... 473 474 def load_table( 475 self, 476 name: str, 477 *, 478 header_row: int | None = None, 479 column_names: list[str] | None = None, 480 skip_rows: int | None = None, 481 n_rows: int | None = None, 482 schema_sample_rows: int | None = 1_000, 483 dtype_coercion: Literal["coerce", "strict"] = "coerce", 484 use_columns: list[str] 485 | list[int] 486 | str 487 | Callable[[ColumnInfoNoDtype], bool] 488 | None = None, 489 dtypes: DType | DTypeMap | None = None, 490 eager: bool = False, 491 ) -> "ExcelTable | pa.RecordBatch": 492 """Loads a table by name. 493 494 :param name: The name of the table to load. 495 :param header_row: The index of the row containing the column labels. 496 If `None`, the table's column names will be used. 497 Any rows before the `header_row` will be automatically skipped. 498 :param column_names: Overrides headers found in the document. 499 If `column_names` is used, `header_row` will be ignored. 500 :param n_rows: Specifies how many rows should be loaded. 501 If `None`, all rows are loaded 502 :param skip_rows: Specifies how many rows should be skipped after the `header_row`. 503 Any rows before the `header_row` are automatically skipped. 504 If `header_row` is `None`, it skips the number of rows from the 505 start of the sheet. 506 :param schema_sample_rows: Specifies how many rows should be used to determine 507 the dtype of a column. Cannot be 0. A specific dtype can be 508 enforced for some or all columns through the `dtypes` parameter. 509 If `None`, all rows will be used. 510 :param dtype_coercion: Specifies how type coercion should behave. `coerce` (the default) 511 will try to coerce different dtypes in a column to the same one, 512 whereas `strict` will raise an error in case a column contains 513 several dtypes. Note that this only applies to columns whose dtype 514 is guessed, i.e. not specified via `dtypes`. 515 :param use_columns: Specifies the columns to use. Can either be: 516 - `None` to select all columns 517 - A list of strings and ints, the column names and/or indices 518 (starting at 0) 519 - A string, a comma separated list of Excel column letters and column 520 ranges (e.g. `"A:E"` or `"A,C,E:F"`, which would result in 521 `A,B,C,D,E` and `A,C,E,F`). Also supports open-ended ranges 522 (e.g. `"B:"` to select all columns from B onwards) and from-beginning 523 ranges (e.g. `":C"` to select columns from A to C). These can be 524 combined for "except" patterns (e.g. `":C,E:"` to select everything 525 except column D) 526 - A callable, a function that takes a column and returns a boolean 527 indicating whether the column should be used 528 :param dtypes: An optional dtype (for all columns) 529 or dict of dtypes with keys as column indices or names. 530 :param eager: Specifies whether the table should be loaded eagerly. 531 `False` (default) will load the table lazily using the `PyCapsule` interface, 532 whereas `True` will load it eagerly via `pyarrow`. 533 534 Eager loading requires the `pyarrow` extra to be installed. 535 """ 536 if eager: 537 return self._reader.load_table( 538 name=name, 539 header_row=header_row, 540 column_names=column_names, 541 skip_rows=skip_rows, 542 n_rows=n_rows, 543 schema_sample_rows=schema_sample_rows, 544 dtype_coercion=dtype_coercion, 545 use_columns=use_columns, 546 dtypes=dtypes, 547 eager=True, 548 ) 549 else: 550 return ExcelTable( 551 self._reader.load_table( 552 name=name, 553 header_row=header_row, 554 column_names=column_names, 555 skip_rows=skip_rows, 556 n_rows=n_rows, 557 schema_sample_rows=schema_sample_rows, 558 dtype_coercion=dtype_coercion, 559 use_columns=use_columns, 560 dtypes=dtypes, 561 eager=False, 562 ) 563 ) 564 565 def load_sheet_eager( 566 self, 567 idx_or_name: int | str, 568 *, 569 header_row: int | None = 0, 570 column_names: list[str] | None = None, 571 skip_rows: int | list[int] | Callable[[int], bool] | None = None, 572 n_rows: int | None = None, 573 schema_sample_rows: int | None = 1_000, 574 dtype_coercion: Literal["coerce", "strict"] = "coerce", 575 use_columns: list[str] | list[int] | str | None = None, 576 dtypes: DType | DTypeMap | None = None, 577 ) -> "pa.RecordBatch": 578 """Loads a sheet eagerly by index or name. 579 580 For xlsx files, this will be faster and more memory-efficient, as it will use 581 `worksheet_range_ref` under the hood, which returns borrowed types. 582 583 Refer to `load_sheet` for parameter documentation 584 585 Requires the `pyarrow` extra to be installed. 586 """ 587 return self._reader.load_sheet( 588 idx_or_name=idx_or_name, 589 header_row=header_row, 590 column_names=column_names, 591 skip_rows=skip_rows, 592 n_rows=n_rows, 593 schema_sample_rows=schema_sample_rows, 594 dtype_coercion=dtype_coercion, 595 use_columns=use_columns, 596 dtypes=dtypes, 597 eager=True, 598 ) 599 600 def load_sheet_by_name( 601 self, 602 name: str, 603 *, 604 header_row: int | None = 0, 605 column_names: list[str] | None = None, 606 skip_rows: int | None = None, 607 n_rows: int | None = None, 608 schema_sample_rows: int | None = 1_000, 609 dtype_coercion: Literal["coerce", "strict"] = "coerce", 610 use_columns: list[str] 611 | list[int] 612 | str 613 | Callable[[ColumnInfoNoDtype], bool] 614 | None = None, 615 dtypes: DType | DTypeMap | None = None, 616 ) -> ExcelSheet: 617 """Loads a sheet by name. 618 619 Refer to `load_sheet` for parameter documentation 620 """ 621 return self.load_sheet( 622 name, 623 header_row=header_row, 624 column_names=column_names, 625 skip_rows=skip_rows, 626 n_rows=n_rows, 627 schema_sample_rows=schema_sample_rows, 628 dtype_coercion=dtype_coercion, 629 use_columns=use_columns, 630 dtypes=dtypes, 631 ) 632 633 def load_sheet_by_idx( 634 self, 635 idx: int, 636 *, 637 header_row: int | None = 0, 638 column_names: list[str] | None = None, 639 skip_rows: int | None = None, 640 n_rows: int | None = None, 641 schema_sample_rows: int | None = 1_000, 642 dtype_coercion: Literal["coerce", "strict"] = "coerce", 643 use_columns: list[str] 644 | list[int] 645 | str 646 | Callable[[ColumnInfoNoDtype], bool] 647 | None = None, 648 dtypes: DType | DTypeMap | None = None, 649 ) -> ExcelSheet: 650 """Loads a sheet by index. 651 652 Refer to `load_sheet` for parameter documentation 653 """ 654 return self.load_sheet( 655 idx, 656 header_row=header_row, 657 column_names=column_names, 658 skip_rows=skip_rows, 659 n_rows=n_rows, 660 schema_sample_rows=schema_sample_rows, 661 dtype_coercion=dtype_coercion, 662 use_columns=use_columns, 663 dtypes=dtypes, 664 ) 665 666 def __repr__(self) -> str: 667 return self._reader.__repr__() 668 669 670def read_excel(source: Path | str | bytes) -> ExcelReader: 671 """Opens and loads an excel file. 672 673 :param source: The path to a file or its content as bytes 674 """ 675 if isinstance(source, (str, Path)): 676 source = expanduser(source) 677 return ExcelReader(_read_excel(source)) 678 679 680__all__ = ( 681 # version 682 "__version__", 683 # main entrypoint 684 "read_excel", 685 # Python types 686 "DType", 687 "DTypeMap", 688 # Excel reader 689 "ExcelReader", 690 # Excel sheet 691 "ExcelSheet", 692 # Excel table 693 "ExcelTable", 694 # Column metadata 695 "DTypeFrom", 696 "ColumnNameFrom", 697 "ColumnInfo", 698 # Defined names 699 "DefinedName", 700 # Parse error information 701 "CellError", 702 "CellErrors", 703 # Exceptions 704 "FastExcelError", 705 "CannotRetrieveCellDataError", 706 "CalamineCellError", 707 "CalamineError", 708 "SheetNotFoundError", 709 "ColumnNotFoundError", 710 "ArrowError", 711 "InvalidParametersError", 712 "UnsupportedColumnTypeCombinationError", 713)
671def read_excel(source: Path | str | bytes) -> ExcelReader: 672 """Opens and loads an excel file. 673 674 :param source: The path to a file or its content as bytes 675 """ 676 if isinstance(source, (str, Path)): 677 source = expanduser(source) 678 return ExcelReader(_read_excel(source))
Opens and loads an excel file.
Parameters
- source: The path to a file or its content as bytes
283class ExcelReader: 284 """A class representing an open Excel file and allowing to read its sheets""" 285 286 def __init__(self, reader: _ExcelReader) -> None: 287 self._reader = reader 288 289 @property 290 def sheet_names(self) -> list[str]: 291 """The list of sheet names""" 292 return self._reader.sheet_names 293 294 @typing.overload 295 def load_sheet( 296 self, 297 idx_or_name: int | str, 298 *, 299 header_row: int | None = 0, 300 column_names: list[str] | None = None, 301 skip_rows: int | list[int] | Callable[[int], bool] | None = None, 302 n_rows: int | None = None, 303 schema_sample_rows: int | None = 1_000, 304 dtype_coercion: Literal["coerce", "strict"] = "coerce", 305 use_columns: list[str] 306 | list[int] 307 | str 308 | Callable[[ColumnInfoNoDtype], bool] 309 | None = None, 310 dtypes: DType | DTypeMap | None = None, 311 eager: Literal[False] = ..., 312 ) -> ExcelSheet: ... 313 314 @typing.overload 315 def load_sheet( 316 self, 317 idx_or_name: int | str, 318 *, 319 header_row: int | None = 0, 320 column_names: list[str] | None = None, 321 skip_rows: int | list[int] | Callable[[int], bool] | None = None, 322 n_rows: int | None = None, 323 schema_sample_rows: int | None = 1_000, 324 dtype_coercion: Literal["coerce", "strict"] = "coerce", 325 use_columns: list[str] 326 | list[int] 327 | str 328 | Callable[[ColumnInfoNoDtype], bool] 329 | None = None, 330 dtypes: DType | DTypeMap | None = None, 331 eager: Literal[True] = ..., 332 ) -> "pa.RecordBatch": ... 333 334 def load_sheet( 335 self, 336 idx_or_name: int | str, 337 *, 338 header_row: int | None = 0, 339 column_names: list[str] | None = None, 340 skip_rows: int | list[int] | Callable[[int], bool] | None = None, 341 n_rows: int | None = None, 342 schema_sample_rows: int | None = 1_000, 343 dtype_coercion: Literal["coerce", "strict"] = "coerce", 344 use_columns: list[str] 345 | list[int] 346 | str 347 | Callable[[ColumnInfoNoDtype], bool] 348 | None = None, 349 dtypes: DType | DTypeMap | None = None, 350 eager: bool = False, 351 ) -> "ExcelSheet | pa.RecordBatch": 352 """Loads a sheet by index or name. 353 354 :param idx_or_name: The index (starting at 0) or the name of the sheet to load. 355 :param header_row: The index of the row containing the column labels, default index is 0. 356 If `None`, the sheet does not have any column labels. 357 Any rows before the `header_row` will be automatically skipped. 358 :param column_names: Overrides headers found in the document. 359 If `column_names` is used, `header_row` will be ignored. 360 :param n_rows: Specifies how many rows should be loaded. 361 If `None`, all rows are loaded 362 :param skip_rows: Specifies which rows should be skipped after the `header_row`. 363 Any rows before the `header_row` are automatically skipped. 364 It means row indices are relative to data rows, not the sheet! 365 Can be one of: 366 - `int`: Skip this many rows after the header row 367 - `list[int]`: Skip specific row indices (0-based relative to data rows) 368 - `Callable[[int], bool]`: Function that receives row index (0-based 369 relative to data rows) and returns True to skip the row 370 - `None`: If `header_row` is None, skips empty rows at beginning 371 :param schema_sample_rows: Specifies how many rows should be used to determine 372 the dtype of a column. Cannot be 0. A specific dtype can be 373 enforced for some or all columns through the `dtypes` parameter. 374 If `None`, all rows will be used. 375 :param dtype_coercion: Specifies how type coercion should behave. `coerce` (the default) 376 will try to coerce different dtypes in a column to the same one, 377 whereas `strict` will raise an error in case a column contains 378 several dtypes. Note that this only applies to columns whose dtype 379 is guessed, i.e. not specified via `dtypes`. 380 :param use_columns: Specifies the columns to use. Can either be: 381 - `None` to select all columns 382 - A list of strings and ints, the column names and/or indices 383 (starting at 0) 384 - A string, a comma separated list of Excel column letters and column 385 ranges (e.g. `"A:E"` or `"A,C,E:F"`, which would result in 386 `A,B,C,D,E` and `A,C,E,F`). Also supports open-ended ranges 387 (e.g. `"B:"` to select all columns from B onwards) and from-beginning 388 ranges (e.g. `":C"` to select columns from A to C). These can be 389 combined for "except" patterns (e.g. `":C,E:"` to select everything 390 except column D) 391 - A callable, a function that takes a column and returns a boolean 392 indicating whether the column should be used 393 :param dtypes: An optional dtype (for all columns) 394 or dict of dtypes with keys as column indices or names. 395 :param eager: Specifies whether the sheet should be loaded eagerly. 396 `False` (default) will load the sheet lazily using the `PyCapsule` interface, 397 whereas `True` will load it eagerly via `pyarrow`. 398 399 Eager loading requires the `pyarrow` extra to be installed. 400 """ 401 sheet_or_rb = self._reader.load_sheet( 402 idx_or_name=idx_or_name, 403 header_row=header_row, 404 column_names=column_names, 405 skip_rows=skip_rows, 406 n_rows=n_rows, 407 schema_sample_rows=schema_sample_rows, 408 dtype_coercion=dtype_coercion, 409 use_columns=use_columns, 410 dtypes=dtypes, 411 eager=eager, 412 ) 413 return sheet_or_rb if eager else ExcelSheet(sheet_or_rb) 414 415 def table_names(self, sheet_name: str | None = None) -> list[str]: 416 """The list of table names. 417 418 Will return an empty list if no tables are found. 419 420 :param sheet_name: If given, will limit the list to the given sheet, will be faster 421 too. 422 """ 423 return self._reader.table_names(sheet_name) 424 425 def defined_names(self) -> list[DefinedName]: 426 """The list of defined names (named ranges) in the workbook. 427 428 Returns a list of DefinedName objects with 'name' and 'formula' attributes. 429 The formula is a string representation of the range or expression. 430 431 Will return an empty list if no defined names are found. 432 """ 433 return self._reader.defined_names() 434 435 @typing.overload 436 def load_table( 437 self, 438 name: str, 439 *, 440 header_row: int | None = None, 441 column_names: list[str] | None = None, 442 skip_rows: int | None = None, 443 n_rows: int | None = None, 444 schema_sample_rows: int | None = 1_000, 445 dtype_coercion: Literal["coerce", "strict"] = "coerce", 446 use_columns: list[str] 447 | list[int] 448 | str 449 | Callable[[ColumnInfoNoDtype], bool] 450 | None = None, 451 dtypes: DType | DTypeMap | None = None, 452 eager: Literal[False] = ..., 453 ) -> ExcelTable: ... 454 455 @typing.overload 456 def load_table( 457 self, 458 name: str, 459 *, 460 header_row: int | None = None, 461 column_names: list[str] | None = None, 462 skip_rows: int | None = None, 463 n_rows: int | None = None, 464 schema_sample_rows: int | None = 1_000, 465 dtype_coercion: Literal["coerce", "strict"] = "coerce", 466 use_columns: list[str] 467 | list[int] 468 | str 469 | Callable[[ColumnInfoNoDtype], bool] 470 | None = None, 471 dtypes: DType | DTypeMap | None = None, 472 eager: Literal[True] = ..., 473 ) -> "pa.RecordBatch": ... 474 475 def load_table( 476 self, 477 name: str, 478 *, 479 header_row: int | None = None, 480 column_names: list[str] | None = None, 481 skip_rows: int | None = None, 482 n_rows: int | None = None, 483 schema_sample_rows: int | None = 1_000, 484 dtype_coercion: Literal["coerce", "strict"] = "coerce", 485 use_columns: list[str] 486 | list[int] 487 | str 488 | Callable[[ColumnInfoNoDtype], bool] 489 | None = None, 490 dtypes: DType | DTypeMap | None = None, 491 eager: bool = False, 492 ) -> "ExcelTable | pa.RecordBatch": 493 """Loads a table by name. 494 495 :param name: The name of the table to load. 496 :param header_row: The index of the row containing the column labels. 497 If `None`, the table's column names will be used. 498 Any rows before the `header_row` will be automatically skipped. 499 :param column_names: Overrides headers found in the document. 500 If `column_names` is used, `header_row` will be ignored. 501 :param n_rows: Specifies how many rows should be loaded. 502 If `None`, all rows are loaded 503 :param skip_rows: Specifies how many rows should be skipped after the `header_row`. 504 Any rows before the `header_row` are automatically skipped. 505 If `header_row` is `None`, it skips the number of rows from the 506 start of the sheet. 507 :param schema_sample_rows: Specifies how many rows should be used to determine 508 the dtype of a column. Cannot be 0. A specific dtype can be 509 enforced for some or all columns through the `dtypes` parameter. 510 If `None`, all rows will be used. 511 :param dtype_coercion: Specifies how type coercion should behave. `coerce` (the default) 512 will try to coerce different dtypes in a column to the same one, 513 whereas `strict` will raise an error in case a column contains 514 several dtypes. Note that this only applies to columns whose dtype 515 is guessed, i.e. not specified via `dtypes`. 516 :param use_columns: Specifies the columns to use. Can either be: 517 - `None` to select all columns 518 - A list of strings and ints, the column names and/or indices 519 (starting at 0) 520 - A string, a comma separated list of Excel column letters and column 521 ranges (e.g. `"A:E"` or `"A,C,E:F"`, which would result in 522 `A,B,C,D,E` and `A,C,E,F`). Also supports open-ended ranges 523 (e.g. `"B:"` to select all columns from B onwards) and from-beginning 524 ranges (e.g. `":C"` to select columns from A to C). These can be 525 combined for "except" patterns (e.g. `":C,E:"` to select everything 526 except column D) 527 - A callable, a function that takes a column and returns a boolean 528 indicating whether the column should be used 529 :param dtypes: An optional dtype (for all columns) 530 or dict of dtypes with keys as column indices or names. 531 :param eager: Specifies whether the table should be loaded eagerly. 532 `False` (default) will load the table lazily using the `PyCapsule` interface, 533 whereas `True` will load it eagerly via `pyarrow`. 534 535 Eager loading requires the `pyarrow` extra to be installed. 536 """ 537 if eager: 538 return self._reader.load_table( 539 name=name, 540 header_row=header_row, 541 column_names=column_names, 542 skip_rows=skip_rows, 543 n_rows=n_rows, 544 schema_sample_rows=schema_sample_rows, 545 dtype_coercion=dtype_coercion, 546 use_columns=use_columns, 547 dtypes=dtypes, 548 eager=True, 549 ) 550 else: 551 return ExcelTable( 552 self._reader.load_table( 553 name=name, 554 header_row=header_row, 555 column_names=column_names, 556 skip_rows=skip_rows, 557 n_rows=n_rows, 558 schema_sample_rows=schema_sample_rows, 559 dtype_coercion=dtype_coercion, 560 use_columns=use_columns, 561 dtypes=dtypes, 562 eager=False, 563 ) 564 ) 565 566 def load_sheet_eager( 567 self, 568 idx_or_name: int | str, 569 *, 570 header_row: int | None = 0, 571 column_names: list[str] | None = None, 572 skip_rows: int | list[int] | Callable[[int], bool] | None = None, 573 n_rows: int | None = None, 574 schema_sample_rows: int | None = 1_000, 575 dtype_coercion: Literal["coerce", "strict"] = "coerce", 576 use_columns: list[str] | list[int] | str | None = None, 577 dtypes: DType | DTypeMap | None = None, 578 ) -> "pa.RecordBatch": 579 """Loads a sheet eagerly by index or name. 580 581 For xlsx files, this will be faster and more memory-efficient, as it will use 582 `worksheet_range_ref` under the hood, which returns borrowed types. 583 584 Refer to `load_sheet` for parameter documentation 585 586 Requires the `pyarrow` extra to be installed. 587 """ 588 return self._reader.load_sheet( 589 idx_or_name=idx_or_name, 590 header_row=header_row, 591 column_names=column_names, 592 skip_rows=skip_rows, 593 n_rows=n_rows, 594 schema_sample_rows=schema_sample_rows, 595 dtype_coercion=dtype_coercion, 596 use_columns=use_columns, 597 dtypes=dtypes, 598 eager=True, 599 ) 600 601 def load_sheet_by_name( 602 self, 603 name: str, 604 *, 605 header_row: int | None = 0, 606 column_names: list[str] | None = None, 607 skip_rows: int | None = None, 608 n_rows: int | None = None, 609 schema_sample_rows: int | None = 1_000, 610 dtype_coercion: Literal["coerce", "strict"] = "coerce", 611 use_columns: list[str] 612 | list[int] 613 | str 614 | Callable[[ColumnInfoNoDtype], bool] 615 | None = None, 616 dtypes: DType | DTypeMap | None = None, 617 ) -> ExcelSheet: 618 """Loads a sheet by name. 619 620 Refer to `load_sheet` for parameter documentation 621 """ 622 return self.load_sheet( 623 name, 624 header_row=header_row, 625 column_names=column_names, 626 skip_rows=skip_rows, 627 n_rows=n_rows, 628 schema_sample_rows=schema_sample_rows, 629 dtype_coercion=dtype_coercion, 630 use_columns=use_columns, 631 dtypes=dtypes, 632 ) 633 634 def load_sheet_by_idx( 635 self, 636 idx: int, 637 *, 638 header_row: int | None = 0, 639 column_names: list[str] | None = None, 640 skip_rows: int | None = None, 641 n_rows: int | None = None, 642 schema_sample_rows: int | None = 1_000, 643 dtype_coercion: Literal["coerce", "strict"] = "coerce", 644 use_columns: list[str] 645 | list[int] 646 | str 647 | Callable[[ColumnInfoNoDtype], bool] 648 | None = None, 649 dtypes: DType | DTypeMap | None = None, 650 ) -> ExcelSheet: 651 """Loads a sheet by index. 652 653 Refer to `load_sheet` for parameter documentation 654 """ 655 return self.load_sheet( 656 idx, 657 header_row=header_row, 658 column_names=column_names, 659 skip_rows=skip_rows, 660 n_rows=n_rows, 661 schema_sample_rows=schema_sample_rows, 662 dtype_coercion=dtype_coercion, 663 use_columns=use_columns, 664 dtypes=dtypes, 665 ) 666 667 def __repr__(self) -> str: 668 return self._reader.__repr__()
A class representing an open Excel file and allowing to read its sheets
289 @property 290 def sheet_names(self) -> list[str]: 291 """The list of sheet names""" 292 return self._reader.sheet_names
The list of sheet names
334 def load_sheet( 335 self, 336 idx_or_name: int | str, 337 *, 338 header_row: int | None = 0, 339 column_names: list[str] | None = None, 340 skip_rows: int | list[int] | Callable[[int], bool] | None = None, 341 n_rows: int | None = None, 342 schema_sample_rows: int | None = 1_000, 343 dtype_coercion: Literal["coerce", "strict"] = "coerce", 344 use_columns: list[str] 345 | list[int] 346 | str 347 | Callable[[ColumnInfoNoDtype], bool] 348 | None = None, 349 dtypes: DType | DTypeMap | None = None, 350 eager: bool = False, 351 ) -> "ExcelSheet | pa.RecordBatch": 352 """Loads a sheet by index or name. 353 354 :param idx_or_name: The index (starting at 0) or the name of the sheet to load. 355 :param header_row: The index of the row containing the column labels, default index is 0. 356 If `None`, the sheet does not have any column labels. 357 Any rows before the `header_row` will be automatically skipped. 358 :param column_names: Overrides headers found in the document. 359 If `column_names` is used, `header_row` will be ignored. 360 :param n_rows: Specifies how many rows should be loaded. 361 If `None`, all rows are loaded 362 :param skip_rows: Specifies which rows should be skipped after the `header_row`. 363 Any rows before the `header_row` are automatically skipped. 364 It means row indices are relative to data rows, not the sheet! 365 Can be one of: 366 - `int`: Skip this many rows after the header row 367 - `list[int]`: Skip specific row indices (0-based relative to data rows) 368 - `Callable[[int], bool]`: Function that receives row index (0-based 369 relative to data rows) and returns True to skip the row 370 - `None`: If `header_row` is None, skips empty rows at beginning 371 :param schema_sample_rows: Specifies how many rows should be used to determine 372 the dtype of a column. Cannot be 0. A specific dtype can be 373 enforced for some or all columns through the `dtypes` parameter. 374 If `None`, all rows will be used. 375 :param dtype_coercion: Specifies how type coercion should behave. `coerce` (the default) 376 will try to coerce different dtypes in a column to the same one, 377 whereas `strict` will raise an error in case a column contains 378 several dtypes. Note that this only applies to columns whose dtype 379 is guessed, i.e. not specified via `dtypes`. 380 :param use_columns: Specifies the columns to use. Can either be: 381 - `None` to select all columns 382 - A list of strings and ints, the column names and/or indices 383 (starting at 0) 384 - A string, a comma separated list of Excel column letters and column 385 ranges (e.g. `"A:E"` or `"A,C,E:F"`, which would result in 386 `A,B,C,D,E` and `A,C,E,F`). Also supports open-ended ranges 387 (e.g. `"B:"` to select all columns from B onwards) and from-beginning 388 ranges (e.g. `":C"` to select columns from A to C). These can be 389 combined for "except" patterns (e.g. `":C,E:"` to select everything 390 except column D) 391 - A callable, a function that takes a column and returns a boolean 392 indicating whether the column should be used 393 :param dtypes: An optional dtype (for all columns) 394 or dict of dtypes with keys as column indices or names. 395 :param eager: Specifies whether the sheet should be loaded eagerly. 396 `False` (default) will load the sheet lazily using the `PyCapsule` interface, 397 whereas `True` will load it eagerly via `pyarrow`. 398 399 Eager loading requires the `pyarrow` extra to be installed. 400 """ 401 sheet_or_rb = self._reader.load_sheet( 402 idx_or_name=idx_or_name, 403 header_row=header_row, 404 column_names=column_names, 405 skip_rows=skip_rows, 406 n_rows=n_rows, 407 schema_sample_rows=schema_sample_rows, 408 dtype_coercion=dtype_coercion, 409 use_columns=use_columns, 410 dtypes=dtypes, 411 eager=eager, 412 ) 413 return sheet_or_rb if eager else ExcelSheet(sheet_or_rb)
Loads a sheet by index or name.
Parameters
- idx_or_name: The index (starting at 0) or the name of the sheet to load.
- header_row: The index of the row containing the column labels, default index is 0.
If
None, the sheet does not have any column labels. Any rows before theheader_rowwill be automatically skipped. - column_names: Overrides headers found in the document.
If
column_namesis used,header_rowwill be ignored. - n_rows: Specifies how many rows should be loaded.
If
None, all rows are loaded - skip_rows: Specifies which rows should be skipped after the
header_row. Any rows before theheader_roware automatically skipped. It means row indices are relative to data rows, not the sheet! Can be one of:int: Skip this many rows after the header rowlist[int]: Skip specific row indices (0-based relative to data rows)Callable[[int], bool]: Function that receives row index (0-based relative to data rows) and returns True to skip the rowNone: Ifheader_rowis None, skips empty rows at beginning
- schema_sample_rows: Specifies how many rows should be used to determine
the dtype of a column. Cannot be 0. A specific dtype can be
enforced for some or all columns through the
dtypesparameter. IfNone, all rows will be used. - dtype_coercion: Specifies how type coercion should behave.
coerce(the default) will try to coerce different dtypes in a column to the same one, whereasstrictwill raise an error in case a column contains several dtypes. Note that this only applies to columns whose dtype is guessed, i.e. not specified viadtypes. - use_columns: Specifies the columns to use. Can either be:
Noneto select all columns- A list of strings and ints, the column names and/or indices (starting at 0)
- A string, a comma separated list of Excel column letters and column
ranges (e.g.
"A:E"or"A,C,E:F", which would result inA,B,C,D,EandA,C,E,F). Also supports open-ended ranges (e.g."B:"to select all columns from B onwards) and from-beginning ranges (e.g.":C"to select columns from A to C). These can be combined for "except" patterns (e.g.":C,E:"to select everything except column D) - A callable, a function that takes a column and returns a boolean indicating whether the column should be used
- dtypes: An optional dtype (for all columns) or dict of dtypes with keys as column indices or names.
eager: Specifies whether the sheet should be loaded eagerly.
False(default) will load the sheet lazily using thePyCapsuleinterface, whereasTruewill load it eagerly viapyarrow.Eager loading requires the `pyarrow` extra to be installed.
415 def table_names(self, sheet_name: str | None = None) -> list[str]: 416 """The list of table names. 417 418 Will return an empty list if no tables are found. 419 420 :param sheet_name: If given, will limit the list to the given sheet, will be faster 421 too. 422 """ 423 return self._reader.table_names(sheet_name)
The list of table names.
Will return an empty list if no tables are found.
Parameters
- sheet_name: If given, will limit the list to the given sheet, will be faster too.
425 def defined_names(self) -> list[DefinedName]: 426 """The list of defined names (named ranges) in the workbook. 427 428 Returns a list of DefinedName objects with 'name' and 'formula' attributes. 429 The formula is a string representation of the range or expression. 430 431 Will return an empty list if no defined names are found. 432 """ 433 return self._reader.defined_names()
The list of defined names (named ranges) in the workbook.
Returns a list of DefinedName objects with 'name' and 'formula' attributes. The formula is a string representation of the range or expression.
Will return an empty list if no defined names are found.
475 def load_table( 476 self, 477 name: str, 478 *, 479 header_row: int | None = None, 480 column_names: list[str] | None = None, 481 skip_rows: int | None = None, 482 n_rows: int | None = None, 483 schema_sample_rows: int | None = 1_000, 484 dtype_coercion: Literal["coerce", "strict"] = "coerce", 485 use_columns: list[str] 486 | list[int] 487 | str 488 | Callable[[ColumnInfoNoDtype], bool] 489 | None = None, 490 dtypes: DType | DTypeMap | None = None, 491 eager: bool = False, 492 ) -> "ExcelTable | pa.RecordBatch": 493 """Loads a table by name. 494 495 :param name: The name of the table to load. 496 :param header_row: The index of the row containing the column labels. 497 If `None`, the table's column names will be used. 498 Any rows before the `header_row` will be automatically skipped. 499 :param column_names: Overrides headers found in the document. 500 If `column_names` is used, `header_row` will be ignored. 501 :param n_rows: Specifies how many rows should be loaded. 502 If `None`, all rows are loaded 503 :param skip_rows: Specifies how many rows should be skipped after the `header_row`. 504 Any rows before the `header_row` are automatically skipped. 505 If `header_row` is `None`, it skips the number of rows from the 506 start of the sheet. 507 :param schema_sample_rows: Specifies how many rows should be used to determine 508 the dtype of a column. Cannot be 0. A specific dtype can be 509 enforced for some or all columns through the `dtypes` parameter. 510 If `None`, all rows will be used. 511 :param dtype_coercion: Specifies how type coercion should behave. `coerce` (the default) 512 will try to coerce different dtypes in a column to the same one, 513 whereas `strict` will raise an error in case a column contains 514 several dtypes. Note that this only applies to columns whose dtype 515 is guessed, i.e. not specified via `dtypes`. 516 :param use_columns: Specifies the columns to use. Can either be: 517 - `None` to select all columns 518 - A list of strings and ints, the column names and/or indices 519 (starting at 0) 520 - A string, a comma separated list of Excel column letters and column 521 ranges (e.g. `"A:E"` or `"A,C,E:F"`, which would result in 522 `A,B,C,D,E` and `A,C,E,F`). Also supports open-ended ranges 523 (e.g. `"B:"` to select all columns from B onwards) and from-beginning 524 ranges (e.g. `":C"` to select columns from A to C). These can be 525 combined for "except" patterns (e.g. `":C,E:"` to select everything 526 except column D) 527 - A callable, a function that takes a column and returns a boolean 528 indicating whether the column should be used 529 :param dtypes: An optional dtype (for all columns) 530 or dict of dtypes with keys as column indices or names. 531 :param eager: Specifies whether the table should be loaded eagerly. 532 `False` (default) will load the table lazily using the `PyCapsule` interface, 533 whereas `True` will load it eagerly via `pyarrow`. 534 535 Eager loading requires the `pyarrow` extra to be installed. 536 """ 537 if eager: 538 return self._reader.load_table( 539 name=name, 540 header_row=header_row, 541 column_names=column_names, 542 skip_rows=skip_rows, 543 n_rows=n_rows, 544 schema_sample_rows=schema_sample_rows, 545 dtype_coercion=dtype_coercion, 546 use_columns=use_columns, 547 dtypes=dtypes, 548 eager=True, 549 ) 550 else: 551 return ExcelTable( 552 self._reader.load_table( 553 name=name, 554 header_row=header_row, 555 column_names=column_names, 556 skip_rows=skip_rows, 557 n_rows=n_rows, 558 schema_sample_rows=schema_sample_rows, 559 dtype_coercion=dtype_coercion, 560 use_columns=use_columns, 561 dtypes=dtypes, 562 eager=False, 563 ) 564 )
Loads a table by name.
Parameters
- name: The name of the table to load.
- header_row: The index of the row containing the column labels.
If
None, the table's column names will be used. Any rows before theheader_rowwill be automatically skipped. - column_names: Overrides headers found in the document.
If
column_namesis used,header_rowwill be ignored. - n_rows: Specifies how many rows should be loaded.
If
None, all rows are loaded - skip_rows: Specifies how many rows should be skipped after the
header_row. Any rows before theheader_roware automatically skipped. Ifheader_rowisNone, it skips the number of rows from the start of the sheet. - schema_sample_rows: Specifies how many rows should be used to determine
the dtype of a column. Cannot be 0. A specific dtype can be
enforced for some or all columns through the
dtypesparameter. IfNone, all rows will be used. - dtype_coercion: Specifies how type coercion should behave.
coerce(the default) will try to coerce different dtypes in a column to the same one, whereasstrictwill raise an error in case a column contains several dtypes. Note that this only applies to columns whose dtype is guessed, i.e. not specified viadtypes. - use_columns: Specifies the columns to use. Can either be:
Noneto select all columns- A list of strings and ints, the column names and/or indices (starting at 0)
- A string, a comma separated list of Excel column letters and column
ranges (e.g.
"A:E"or"A,C,E:F", which would result inA,B,C,D,EandA,C,E,F). Also supports open-ended ranges (e.g."B:"to select all columns from B onwards) and from-beginning ranges (e.g.":C"to select columns from A to C). These can be combined for "except" patterns (e.g.":C,E:"to select everything except column D) - A callable, a function that takes a column and returns a boolean indicating whether the column should be used
- dtypes: An optional dtype (for all columns) or dict of dtypes with keys as column indices or names.
eager: Specifies whether the table should be loaded eagerly.
False(default) will load the table lazily using thePyCapsuleinterface, whereasTruewill load it eagerly viapyarrow.Eager loading requires the `pyarrow` extra to be installed.
566 def load_sheet_eager( 567 self, 568 idx_or_name: int | str, 569 *, 570 header_row: int | None = 0, 571 column_names: list[str] | None = None, 572 skip_rows: int | list[int] | Callable[[int], bool] | None = None, 573 n_rows: int | None = None, 574 schema_sample_rows: int | None = 1_000, 575 dtype_coercion: Literal["coerce", "strict"] = "coerce", 576 use_columns: list[str] | list[int] | str | None = None, 577 dtypes: DType | DTypeMap | None = None, 578 ) -> "pa.RecordBatch": 579 """Loads a sheet eagerly by index or name. 580 581 For xlsx files, this will be faster and more memory-efficient, as it will use 582 `worksheet_range_ref` under the hood, which returns borrowed types. 583 584 Refer to `load_sheet` for parameter documentation 585 586 Requires the `pyarrow` extra to be installed. 587 """ 588 return self._reader.load_sheet( 589 idx_or_name=idx_or_name, 590 header_row=header_row, 591 column_names=column_names, 592 skip_rows=skip_rows, 593 n_rows=n_rows, 594 schema_sample_rows=schema_sample_rows, 595 dtype_coercion=dtype_coercion, 596 use_columns=use_columns, 597 dtypes=dtypes, 598 eager=True, 599 )
Loads a sheet eagerly by index or name.
For xlsx files, this will be faster and more memory-efficient, as it will use
worksheet_range_ref under the hood, which returns borrowed types.
Refer to load_sheet for parameter documentation
Requires the pyarrow extra to be installed.
601 def load_sheet_by_name( 602 self, 603 name: str, 604 *, 605 header_row: int | None = 0, 606 column_names: list[str] | None = None, 607 skip_rows: int | None = None, 608 n_rows: int | None = None, 609 schema_sample_rows: int | None = 1_000, 610 dtype_coercion: Literal["coerce", "strict"] = "coerce", 611 use_columns: list[str] 612 | list[int] 613 | str 614 | Callable[[ColumnInfoNoDtype], bool] 615 | None = None, 616 dtypes: DType | DTypeMap | None = None, 617 ) -> ExcelSheet: 618 """Loads a sheet by name. 619 620 Refer to `load_sheet` for parameter documentation 621 """ 622 return self.load_sheet( 623 name, 624 header_row=header_row, 625 column_names=column_names, 626 skip_rows=skip_rows, 627 n_rows=n_rows, 628 schema_sample_rows=schema_sample_rows, 629 dtype_coercion=dtype_coercion, 630 use_columns=use_columns, 631 dtypes=dtypes, 632 )
Loads a sheet by name.
Refer to load_sheet for parameter documentation
634 def load_sheet_by_idx( 635 self, 636 idx: int, 637 *, 638 header_row: int | None = 0, 639 column_names: list[str] | None = None, 640 skip_rows: int | None = None, 641 n_rows: int | None = None, 642 schema_sample_rows: int | None = 1_000, 643 dtype_coercion: Literal["coerce", "strict"] = "coerce", 644 use_columns: list[str] 645 | list[int] 646 | str 647 | Callable[[ColumnInfoNoDtype], bool] 648 | None = None, 649 dtypes: DType | DTypeMap | None = None, 650 ) -> ExcelSheet: 651 """Loads a sheet by index. 652 653 Refer to `load_sheet` for parameter documentation 654 """ 655 return self.load_sheet( 656 idx, 657 header_row=header_row, 658 column_names=column_names, 659 skip_rows=skip_rows, 660 n_rows=n_rows, 661 schema_sample_rows=schema_sample_rows, 662 dtype_coercion=dtype_coercion, 663 use_columns=use_columns, 664 dtypes=dtypes, 665 )
Loads a sheet by index.
Refer to load_sheet for parameter documentation
60class ExcelSheet: 61 """A class representing a single sheet in an Excel File""" 62 63 def __init__(self, sheet: _ExcelSheet) -> None: 64 self._sheet = sheet 65 66 @property 67 def name(self) -> str: 68 """The name of the sheet""" 69 return self._sheet.name 70 71 @property 72 def width(self) -> int: 73 """The sheet's width""" 74 return self._sheet.width 75 76 @property 77 def height(self) -> int: 78 """The sheet's height, with `skip_rows` and `nrows` applied""" 79 return self._sheet.height 80 81 @property 82 def total_height(self) -> int: 83 """The sheet's total height""" 84 return self._sheet.total_height 85 86 @property 87 def selected_columns(self) -> list[ColumnInfo]: 88 """The sheet's selected columns""" 89 return self._sheet.selected_columns 90 91 def available_columns(self) -> list[ColumnInfo]: 92 """The columns available for the given sheet""" 93 return self._sheet.available_columns() 94 95 @property 96 def specified_dtypes(self) -> DTypeMap | None: 97 """The dtypes specified for the sheet""" 98 return self._sheet.specified_dtypes 99 100 @property 101 def visible(self) -> SheetVisible: 102 """The visibility of the sheet""" 103 return self._sheet.visible 104 105 def to_arrow(self) -> "pa.RecordBatch": 106 """Converts the sheet to a pyarrow `RecordBatch` 107 108 Requires the `pyarrow` extra to be installed. 109 """ 110 if not _PYARROW_AVAILABLE: 111 raise ImportError( 112 "pyarrow is required for to_arrow(). Install with: pip install 'fastexcel[pyarrow]'" 113 ) 114 return self._sheet.to_arrow() 115 116 def to_arrow_with_errors(self) -> "tuple[pa.RecordBatch, CellErrors | None]": 117 """Converts the sheet to a pyarrow `RecordBatch` with error information. 118 119 Stores the positions of any values that cannot be parsed as the specified type and were 120 therefore converted to None. 121 122 Requires the `pyarrow` extra to be installed. 123 """ 124 if not _PYARROW_AVAILABLE: 125 raise ImportError( 126 "pyarrow is required for to_arrow_with_errors(). Install with: pip install 'fastexcel[pyarrow]'" # noqa: E501 127 ) 128 rb, cell_errors = self._sheet.to_arrow_with_errors() 129 if not cell_errors.errors: 130 return (rb, None) 131 return (rb, cell_errors) 132 133 def to_pandas(self) -> "pd.DataFrame": 134 """Converts the sheet to a Pandas `DataFrame`. 135 136 Requires the `pandas` extra to be installed. 137 """ 138 # Note: pandas PyCapsule interface requires __dataframe__ or __arrow_c_stream__ 139 # which we don't implement. Using pyarrow conversion for now. 140 # (see https://pandas.pydata.org/docs/reference/api/pandas.api.interchange.from_dataframe.html) 141 return self.to_arrow().to_pandas() 142 143 def to_polars(self) -> "pl.DataFrame": 144 """Converts the sheet to a Polars `DataFrame`. 145 146 Uses the Arrow PyCapsule Interface for zero-copy data exchange. 147 Requires the `polars` extra to be installed. 148 """ 149 import polars as pl 150 151 return pl.DataFrame(self) 152 153 def __arrow_c_schema__(self) -> object: 154 """Export the schema as an `ArrowSchema` `PyCapsule`. 155 156 https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowschema-export 157 158 The Arrow PyCapsule Interface enables zero-copy data exchange with 159 Arrow-compatible libraries without requiring PyArrow as a dependency. 160 """ 161 return self._sheet.__arrow_c_schema__() 162 163 def __arrow_c_array__(self, requested_schema: object | None = None) -> tuple[object, object]: 164 """Export the schema and data as a pair of `ArrowSchema` and `ArrowArray` `PyCapsules`. 165 166 The optional `requested_schema` parameter allows for potential schema conversion. 167 168 https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowarray-export 169 170 The Arrow PyCapsule Interface enables zero-copy data exchange with 171 Arrow-compatible libraries without requiring PyArrow as a dependency. 172 """ 173 return self._sheet.__arrow_c_array__(requested_schema) 174 175 def __repr__(self) -> str: 176 return self._sheet.__repr__()
A class representing a single sheet in an Excel File
76 @property 77 def height(self) -> int: 78 """The sheet's height, with `skip_rows` and `nrows` applied""" 79 return self._sheet.height
The sheet's height, with skip_rows and nrows applied
81 @property 82 def total_height(self) -> int: 83 """The sheet's total height""" 84 return self._sheet.total_height
The sheet's total height
86 @property 87 def selected_columns(self) -> list[ColumnInfo]: 88 """The sheet's selected columns""" 89 return self._sheet.selected_columns
The sheet's selected columns
91 def available_columns(self) -> list[ColumnInfo]: 92 """The columns available for the given sheet""" 93 return self._sheet.available_columns()
The columns available for the given sheet
95 @property 96 def specified_dtypes(self) -> DTypeMap | None: 97 """The dtypes specified for the sheet""" 98 return self._sheet.specified_dtypes
The dtypes specified for the sheet
100 @property 101 def visible(self) -> SheetVisible: 102 """The visibility of the sheet""" 103 return self._sheet.visible
The visibility of the sheet
105 def to_arrow(self) -> "pa.RecordBatch": 106 """Converts the sheet to a pyarrow `RecordBatch` 107 108 Requires the `pyarrow` extra to be installed. 109 """ 110 if not _PYARROW_AVAILABLE: 111 raise ImportError( 112 "pyarrow is required for to_arrow(). Install with: pip install 'fastexcel[pyarrow]'" 113 ) 114 return self._sheet.to_arrow()
Converts the sheet to a pyarrow RecordBatch
Requires the pyarrow extra to be installed.
116 def to_arrow_with_errors(self) -> "tuple[pa.RecordBatch, CellErrors | None]": 117 """Converts the sheet to a pyarrow `RecordBatch` with error information. 118 119 Stores the positions of any values that cannot be parsed as the specified type and were 120 therefore converted to None. 121 122 Requires the `pyarrow` extra to be installed. 123 """ 124 if not _PYARROW_AVAILABLE: 125 raise ImportError( 126 "pyarrow is required for to_arrow_with_errors(). Install with: pip install 'fastexcel[pyarrow]'" # noqa: E501 127 ) 128 rb, cell_errors = self._sheet.to_arrow_with_errors() 129 if not cell_errors.errors: 130 return (rb, None) 131 return (rb, cell_errors)
Converts the sheet to a pyarrow RecordBatch with error information.
Stores the positions of any values that cannot be parsed as the specified type and were therefore converted to None.
Requires the pyarrow extra to be installed.
133 def to_pandas(self) -> "pd.DataFrame": 134 """Converts the sheet to a Pandas `DataFrame`. 135 136 Requires the `pandas` extra to be installed. 137 """ 138 # Note: pandas PyCapsule interface requires __dataframe__ or __arrow_c_stream__ 139 # which we don't implement. Using pyarrow conversion for now. 140 # (see https://pandas.pydata.org/docs/reference/api/pandas.api.interchange.from_dataframe.html) 141 return self.to_arrow().to_pandas()
Converts the sheet to a Pandas DataFrame.
Requires the pandas extra to be installed.
143 def to_polars(self) -> "pl.DataFrame": 144 """Converts the sheet to a Polars `DataFrame`. 145 146 Uses the Arrow PyCapsule Interface for zero-copy data exchange. 147 Requires the `polars` extra to be installed. 148 """ 149 import polars as pl 150 151 return pl.DataFrame(self)
Converts the sheet to a Polars DataFrame.
Uses the Arrow PyCapsule Interface for zero-copy data exchange.
Requires the polars extra to be installed.
179class ExcelTable: 180 """A class representing a single table in an Excel file""" 181 182 def __init__(self, table: _ExcelTable) -> None: 183 self._table = table 184 185 @property 186 def name(self) -> str: 187 """The name of the table""" 188 return self._table.name 189 190 @property 191 def sheet_name(self) -> str: 192 """The name of the sheet this table belongs to""" 193 return self._table.sheet_name 194 195 @property 196 def width(self) -> int: 197 """The table's width""" 198 return self._table.width 199 200 @property 201 def height(self) -> int: 202 """The table's height""" 203 return self._table.height 204 205 @property 206 def total_height(self) -> int: 207 """The table's total height""" 208 return self._table.total_height 209 210 @property 211 def offset(self) -> int: 212 """The table's offset before data starts""" 213 return self._table.offset 214 215 @property 216 def selected_columns(self) -> list[ColumnInfo]: 217 """The table's selected columns""" 218 return self._table.selected_columns 219 220 def available_columns(self) -> list[ColumnInfo]: 221 """The columns available for the given table""" 222 return self._table.available_columns() 223 224 @property 225 def specified_dtypes(self) -> DTypeMap | None: 226 """The dtypes specified for the table""" 227 return self._table.specified_dtypes 228 229 def to_arrow(self) -> "pa.RecordBatch": 230 """Converts the table to a pyarrow `RecordBatch` 231 232 Requires the `pyarrow` extra to be installed. 233 """ 234 if not _PYARROW_AVAILABLE: 235 raise ImportError( 236 "pyarrow is required for to_arrow(). Install with: pip install 'fastexcel[pyarrow]'" 237 ) 238 return self._table.to_arrow() 239 240 def to_pandas(self) -> "pd.DataFrame": 241 """Converts the table to a Pandas `DataFrame`. 242 243 Requires the `pandas` extra to be installed. 244 """ 245 # Note: pandas PyCapsule interface requires __dataframe__ or __arrow_c_stream__ 246 # which we don't implement. Using pyarrow conversion for now. 247 # (see https://pandas.pydata.org/docs/reference/api/pandas.api.interchange.from_dataframe.html) 248 return self.to_arrow().to_pandas() 249 250 def to_polars(self) -> "pl.DataFrame": 251 """Converts the table to a Polars `DataFrame`. 252 253 Uses the Arrow PyCapsule Interface for zero-copy data exchange. 254 Requires the `polars` extra to be installed. 255 """ 256 import polars as pl 257 258 return pl.DataFrame(self) 259 260 def __arrow_c_schema__(self) -> object: 261 """Export the schema as an `ArrowSchema` `PyCapsule`. 262 263 https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowschema-export 264 265 The Arrow PyCapsule Interface enables zero-copy data exchange with 266 Arrow-compatible libraries without requiring PyArrow as a dependency. 267 """ 268 return self._table.__arrow_c_schema__() 269 270 def __arrow_c_array__(self, requested_schema: object | None = None) -> tuple[object, object]: 271 """Export the schema and data as a pair of `ArrowSchema` and `ArrowArray` `PyCapsules`. 272 273 The optional `requested_schema` parameter allows for potential schema conversion. 274 275 https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowarray-export 276 277 The Arrow PyCapsule Interface enables zero-copy data exchange with 278 Arrow-compatible libraries without requiring PyArrow as a dependency. 279 """ 280 return self._table.__arrow_c_array__(requested_schema)
A class representing a single table in an Excel file
185 @property 186 def name(self) -> str: 187 """The name of the table""" 188 return self._table.name
The name of the table
190 @property 191 def sheet_name(self) -> str: 192 """The name of the sheet this table belongs to""" 193 return self._table.sheet_name
The name of the sheet this table belongs to
200 @property 201 def height(self) -> int: 202 """The table's height""" 203 return self._table.height
The table's height
205 @property 206 def total_height(self) -> int: 207 """The table's total height""" 208 return self._table.total_height
The table's total height
210 @property 211 def offset(self) -> int: 212 """The table's offset before data starts""" 213 return self._table.offset
The table's offset before data starts
215 @property 216 def selected_columns(self) -> list[ColumnInfo]: 217 """The table's selected columns""" 218 return self._table.selected_columns
The table's selected columns
220 def available_columns(self) -> list[ColumnInfo]: 221 """The columns available for the given table""" 222 return self._table.available_columns()
The columns available for the given table
224 @property 225 def specified_dtypes(self) -> DTypeMap | None: 226 """The dtypes specified for the table""" 227 return self._table.specified_dtypes
The dtypes specified for the table
229 def to_arrow(self) -> "pa.RecordBatch": 230 """Converts the table to a pyarrow `RecordBatch` 231 232 Requires the `pyarrow` extra to be installed. 233 """ 234 if not _PYARROW_AVAILABLE: 235 raise ImportError( 236 "pyarrow is required for to_arrow(). Install with: pip install 'fastexcel[pyarrow]'" 237 ) 238 return self._table.to_arrow()
Converts the table to a pyarrow RecordBatch
Requires the pyarrow extra to be installed.
240 def to_pandas(self) -> "pd.DataFrame": 241 """Converts the table to a Pandas `DataFrame`. 242 243 Requires the `pandas` extra to be installed. 244 """ 245 # Note: pandas PyCapsule interface requires __dataframe__ or __arrow_c_stream__ 246 # which we don't implement. Using pyarrow conversion for now. 247 # (see https://pandas.pydata.org/docs/reference/api/pandas.api.interchange.from_dataframe.html) 248 return self.to_arrow().to_pandas()
Converts the table to a Pandas DataFrame.
Requires the pandas extra to be installed.
250 def to_polars(self) -> "pl.DataFrame": 251 """Converts the table to a Polars `DataFrame`. 252 253 Uses the Arrow PyCapsule Interface for zero-copy data exchange. 254 Requires the `polars` extra to be installed. 255 """ 256 import polars as pl 257 258 return pl.DataFrame(self)
Converts the table to a Polars DataFrame.
Uses the Arrow PyCapsule Interface for zero-copy data exchange.
Requires the polars extra to be installed.
Metadata about a single column in a sheet.
fastexcel.DTypeFrom. How the dtype of the column was determined.
One of three possible values:
"provided_by_index": The dtype was specified via the column index"provided_by_name": The dtype was specified via the column name"guessed": The dtype was determined from the content of the column
fastexcel.ColumnNameFrom. How the name of the column was determined.
One of three possible values:
"provided": The column name was provided via theuse_columnsparameter"looked_up": The column name was looked up from the data found in the sheet"generated": The column name was generated from the column index, either becauseheader_rowwasNone, or because it could not be looked up
The base class for all fastexcel errors
Data for a given cell cannot be retrieved
calamine returned an error regarding the content of the cell
Generic calamine error
Sheet was not found
Column was not found
Generic arrow error
Provided parameters are invalid
Column contains an unsupported type combination