fastexcel
1from __future__ import annotations 2 3import typing 4from collections.abc import Callable 5from typing import TYPE_CHECKING, Literal, TypeAlias 6 7if TYPE_CHECKING: 8 import pandas as pd 9 import polars as pl 10 import pyarrow as pa 11 12from os.path import expanduser 13from pathlib import Path 14 15try: 16 import importlib.util 17 18 importlib.util.find_spec("pyarrow") 19 _PYARROW_AVAILABLE = True 20except ImportError: 21 _PYARROW_AVAILABLE = False 22 23from ._fastexcel import ( 24 ArrowError, 25 CalamineCellError, 26 CalamineError, 27 CannotRetrieveCellDataError, 28 CellError, 29 CellErrors, 30 ColumnInfo, 31 ColumnInfoNoDtype, 32 ColumnNotFoundError, 33 DefinedName, 34 FastExcelError, 35 InvalidParametersError, 36 SheetNotFoundError, 37 UnsupportedColumnTypeCombinationError, 38 __version__, 39 _ExcelReader, 40 _ExcelSheet, 41 _ExcelTable, 42) 43from ._fastexcel import read_excel as _read_excel 44 45DType = Literal["null", "int", "float", "string", "boolean", "datetime", "date", "duration"] 46DTypeMap: TypeAlias = "dict[str | int, DType]" 47ColumnNameFrom: TypeAlias = Literal["provided", "looked_up", "generated"] 48DTypeFrom: TypeAlias = Literal[ 49 "provided_for_all", "provided_by_index", "provided_by_name", "guessed" 50] 51SheetVisible: TypeAlias = Literal["visible", "hidden", "veryhidden"] 52 53 54class ExcelSheet: 55 """A class representing a single sheet in an Excel File""" 56 57 def __init__(self, sheet: _ExcelSheet) -> None: 58 self._sheet = sheet 59 60 @property 61 def name(self) -> str: 62 """The name of the sheet""" 63 return self._sheet.name 64 65 @property 66 def width(self) -> int: 67 """The sheet's width""" 68 return self._sheet.width 69 70 @property 71 def height(self) -> int: 72 """The sheet's height, with `skip_rows` and `nrows` applied""" 73 return self._sheet.height 74 75 @property 76 def total_height(self) -> int: 77 """The sheet's total height""" 78 return self._sheet.total_height 79 80 @property 81 def selected_columns(self) -> list[ColumnInfo]: 82 """The sheet's selected columns""" 83 return self._sheet.selected_columns 84 85 def available_columns(self) -> list[ColumnInfo]: 86 """The columns available for the given sheet""" 87 return self._sheet.available_columns() 88 89 @property 90 def specified_dtypes(self) -> DTypeMap | None: 91 """The dtypes specified for the sheet""" 92 return self._sheet.specified_dtypes 93 94 @property 95 def visible(self) -> SheetVisible: 96 """The visibility of the sheet""" 97 return self._sheet.visible 98 99 def to_arrow(self) -> pa.RecordBatch: 100 """Converts the sheet to a pyarrow `RecordBatch` 101 102 Requires the `pyarrow` extra to be installed. 103 """ 104 if not _PYARROW_AVAILABLE: 105 raise ImportError( 106 "pyarrow is required for to_arrow(). Install with: pip install 'fastexcel[pyarrow]'" 107 ) 108 return self._sheet.to_arrow() 109 110 def to_arrow_with_errors(self) -> tuple[pa.RecordBatch, CellErrors | None]: 111 """Converts the sheet to a pyarrow `RecordBatch` with error information. 112 113 Stores the positions of any values that cannot be parsed as the specified type and were 114 therefore converted to None. 115 116 Requires the `pyarrow` extra to be installed. 117 """ 118 if not _PYARROW_AVAILABLE: 119 raise ImportError( 120 "pyarrow is required for to_arrow_with_errors(). Install with: pip install 'fastexcel[pyarrow]'" # noqa: E501 121 ) 122 rb, cell_errors = self._sheet.to_arrow_with_errors() 123 if not cell_errors.errors: 124 return (rb, None) 125 return (rb, cell_errors) 126 127 def to_pandas(self) -> pd.DataFrame: 128 """Converts the sheet to a Pandas `DataFrame`. 129 130 Requires the `pandas` extra to be installed. 131 """ 132 # Note: pandas PyCapsule interface requires __dataframe__ or __arrow_c_stream__ 133 # which we don't implement. Using pyarrow conversion for now. 134 # (see https://pandas.pydata.org/docs/reference/api/pandas.api.interchange.from_dataframe.html) 135 return self.to_arrow().to_pandas() 136 137 def to_polars(self) -> pl.DataFrame: 138 """Converts the sheet to a Polars `DataFrame`. 139 140 Uses the Arrow PyCapsule Interface for zero-copy data exchange. 141 Requires the `polars` extra to be installed. 142 """ 143 import polars as pl 144 145 return pl.DataFrame(self) 146 147 def __arrow_c_schema__(self) -> object: 148 """Export the schema as an `ArrowSchema` `PyCapsule`. 149 150 https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowschema-export 151 152 The Arrow PyCapsule Interface enables zero-copy data exchange with 153 Arrow-compatible libraries without requiring PyArrow as a dependency. 154 """ 155 return self._sheet.__arrow_c_schema__() 156 157 def __arrow_c_array__(self, requested_schema: object | None = None) -> tuple[object, object]: 158 """Export the schema and data as a pair of `ArrowSchema` and `ArrowArray` `PyCapsules`. 159 160 The optional `requested_schema` parameter allows for potential schema conversion. 161 162 https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowarray-export 163 164 The Arrow PyCapsule Interface enables zero-copy data exchange with 165 Arrow-compatible libraries without requiring PyArrow as a dependency. 166 """ 167 return self._sheet.__arrow_c_array__(requested_schema) 168 169 def __repr__(self) -> str: 170 return self._sheet.__repr__() 171 172 173class ExcelTable: 174 """A class representing a single table in an Excel file""" 175 176 def __init__(self, table: _ExcelTable) -> None: 177 self._table = table 178 179 @property 180 def name(self) -> str: 181 """The name of the table""" 182 return self._table.name 183 184 @property 185 def sheet_name(self) -> str: 186 """The name of the sheet this table belongs to""" 187 return self._table.sheet_name 188 189 @property 190 def width(self) -> int: 191 """The table's width""" 192 return self._table.width 193 194 @property 195 def height(self) -> int: 196 """The table's height""" 197 return self._table.height 198 199 @property 200 def total_height(self) -> int: 201 """The table's total height""" 202 return self._table.total_height 203 204 @property 205 def offset(self) -> int: 206 """The table's offset before data starts""" 207 return self._table.offset 208 209 @property 210 def selected_columns(self) -> list[ColumnInfo]: 211 """The table's selected columns""" 212 return self._table.selected_columns 213 214 def available_columns(self) -> list[ColumnInfo]: 215 """The columns available for the given table""" 216 return self._table.available_columns() 217 218 @property 219 def specified_dtypes(self) -> DTypeMap | None: 220 """The dtypes specified for the table""" 221 return self._table.specified_dtypes 222 223 def to_arrow(self) -> pa.RecordBatch: 224 """Converts the table to a pyarrow `RecordBatch` 225 226 Requires the `pyarrow` extra to be installed. 227 """ 228 if not _PYARROW_AVAILABLE: 229 raise ImportError( 230 "pyarrow is required for to_arrow(). Install with: pip install 'fastexcel[pyarrow]'" 231 ) 232 return self._table.to_arrow() 233 234 def to_pandas(self) -> pd.DataFrame: 235 """Converts the table to a Pandas `DataFrame`. 236 237 Requires the `pandas` extra to be installed. 238 """ 239 # Note: pandas PyCapsule interface requires __dataframe__ or __arrow_c_stream__ 240 # which we don't implement. Using pyarrow conversion for now. 241 # (see https://pandas.pydata.org/docs/reference/api/pandas.api.interchange.from_dataframe.html) 242 return self.to_arrow().to_pandas() 243 244 def to_polars(self) -> pl.DataFrame: 245 """Converts the table to a Polars `DataFrame`. 246 247 Uses the Arrow PyCapsule Interface for zero-copy data exchange. 248 Requires the `polars` extra to be installed. 249 """ 250 import polars as pl 251 252 return pl.DataFrame(self) 253 254 def __arrow_c_schema__(self) -> object: 255 """Export the schema as an `ArrowSchema` `PyCapsule`. 256 257 https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowschema-export 258 259 The Arrow PyCapsule Interface enables zero-copy data exchange with 260 Arrow-compatible libraries without requiring PyArrow as a dependency. 261 """ 262 return self._table.__arrow_c_schema__() 263 264 def __arrow_c_array__(self, requested_schema: object | None = None) -> tuple[object, object]: 265 """Export the schema and data as a pair of `ArrowSchema` and `ArrowArray` `PyCapsules`. 266 267 The optional `requested_schema` parameter allows for potential schema conversion. 268 269 https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowarray-export 270 271 The Arrow PyCapsule Interface enables zero-copy data exchange with 272 Arrow-compatible libraries without requiring PyArrow as a dependency. 273 """ 274 return self._table.__arrow_c_array__(requested_schema) 275 276 277class ExcelReader: 278 """A class representing an open Excel file and allowing to read its sheets""" 279 280 def __init__(self, reader: _ExcelReader) -> None: 281 self._reader = reader 282 283 @property 284 def sheet_names(self) -> list[str]: 285 """The list of sheet names""" 286 return self._reader.sheet_names 287 288 @typing.overload 289 def load_sheet( 290 self, 291 idx_or_name: int | str, 292 *, 293 header_row: int | None = 0, 294 column_names: list[str] | None = None, 295 skip_rows: int | list[int] | Callable[[int], bool] | None = None, 296 n_rows: int | None = None, 297 schema_sample_rows: int | None = 1_000, 298 dtype_coercion: Literal["coerce", "strict"] = "coerce", 299 use_columns: list[str] 300 | list[int] 301 | str 302 | Callable[[ColumnInfoNoDtype], bool] 303 | None = None, 304 dtypes: DType | DTypeMap | None = None, 305 eager: Literal[False] = ..., 306 skip_whitespace_tail_rows: bool = False, 307 whitespace_as_null: bool = False, 308 ) -> ExcelSheet: ... 309 310 @typing.overload 311 def load_sheet( 312 self, 313 idx_or_name: int | str, 314 *, 315 header_row: int | None = 0, 316 column_names: list[str] | None = None, 317 skip_rows: int | list[int] | Callable[[int], bool] | None = None, 318 n_rows: int | None = None, 319 schema_sample_rows: int | None = 1_000, 320 dtype_coercion: Literal["coerce", "strict"] = "coerce", 321 use_columns: list[str] 322 | list[int] 323 | str 324 | Callable[[ColumnInfoNoDtype], bool] 325 | None = None, 326 dtypes: DType | DTypeMap | None = None, 327 eager: Literal[True] = ..., 328 skip_whitespace_tail_rows: bool = False, 329 whitespace_as_null: bool = False, 330 ) -> pa.RecordBatch: ... 331 332 def load_sheet( 333 self, 334 idx_or_name: int | str, 335 *, 336 header_row: int | None = 0, 337 column_names: list[str] | None = None, 338 skip_rows: int | list[int] | Callable[[int], bool] | None = None, 339 n_rows: int | None = None, 340 schema_sample_rows: int | None = 1_000, 341 dtype_coercion: Literal["coerce", "strict"] = "coerce", 342 use_columns: list[str] 343 | list[int] 344 | str 345 | Callable[[ColumnInfoNoDtype], bool] 346 | None = None, 347 dtypes: DType | DTypeMap | None = None, 348 eager: bool = False, 349 skip_whitespace_tail_rows: bool = False, 350 whitespace_as_null: bool = False, 351 ) -> ExcelSheet | pa.RecordBatch: 352 """Loads a sheet by index or name. 353 354 :param idx_or_name: The index (starting at 0) or the name of the sheet to load. 355 :param header_row: The index of the row containing the column labels, default index is 0. 356 If `None`, the sheet does not have any column labels. 357 Any rows before the `header_row` will be automatically skipped. 358 :param column_names: Overrides headers found in the document. 359 If `column_names` is used, `header_row` will be ignored. 360 :param n_rows: Specifies how many rows should be loaded. 361 If `None`, all rows are loaded 362 :param skip_rows: Specifies which rows should be skipped after the `header_row`. 363 Any rows before the `header_row` are automatically skipped. 364 It means row indices are relative to data rows, not the sheet! 365 Can be one of: 366 - `int`: Skip this many rows after the header row 367 - `list[int]`: Skip specific row indices (0-based relative to data rows) 368 - `Callable[[int], bool]`: Function that receives row index (0-based 369 relative to data rows) and returns True to skip the row 370 - `None`: If `header_row` is None, skips empty rows at beginning 371 :param schema_sample_rows: Specifies how many rows should be used to determine 372 the dtype of a column. Cannot be 0. A specific dtype can be 373 enforced for some or all columns through the `dtypes` parameter. 374 If `None`, all rows will be used. 375 :param dtype_coercion: Specifies how type coercion should behave. `coerce` (the default) 376 will try to coerce different dtypes in a column to the same one, 377 whereas `strict` will raise an error in case a column contains 378 several dtypes. Note that this only applies to columns whose dtype 379 is guessed, i.e. not specified via `dtypes`. 380 :param use_columns: Specifies the columns to use. Can either be: 381 - `None` to select all columns 382 - A list of strings and ints, the column names and/or indices 383 (starting at 0) 384 - A string, a comma separated list of Excel column letters and column 385 ranges (e.g. `"A:E"` or `"A,C,E:F"`, which would result in 386 `A,B,C,D,E` and `A,C,E,F`). Also supports open-ended ranges 387 (e.g. `"B:"` to select all columns from B onwards) and from-beginning 388 ranges (e.g. `":C"` to select columns from A to C). These can be 389 combined for "except" patterns (e.g. `":C,E:"` to select everything 390 except column D) 391 - A callable, a function that takes a column and returns a boolean 392 indicating whether the column should be used 393 :param dtypes: An optional dtype (for all columns) 394 or dict of dtypes with keys as column indices or names. 395 :param eager: Specifies whether the sheet should be loaded eagerly. 396 `False` (default) will load the sheet lazily using the `PyCapsule` interface, 397 whereas `True` will load it eagerly via `pyarrow`. 398 399 Eager loading requires the `pyarrow` extra to be installed. 400 :param skip_whitespace_tail_rows: Skip rows at the end of the sheet 401 containing only whitespace and null values. 402 :param whitespace_as_null: Consider cells containing only whitespace as null values. 403 """ 404 sheet_or_rb = self._reader.load_sheet( 405 idx_or_name=idx_or_name, 406 header_row=header_row, 407 column_names=column_names, 408 skip_rows=skip_rows, 409 n_rows=n_rows, 410 schema_sample_rows=schema_sample_rows, 411 dtype_coercion=dtype_coercion, 412 use_columns=use_columns, 413 dtypes=dtypes, 414 eager=eager, 415 skip_whitespace_tail_rows=skip_whitespace_tail_rows, 416 whitespace_as_null=whitespace_as_null, 417 ) 418 return sheet_or_rb if eager else ExcelSheet(sheet_or_rb) 419 420 def table_names(self, sheet_name: str | None = None) -> list[str]: 421 """The list of table names. 422 423 Will return an empty list if no tables are found. 424 425 :param sheet_name: If given, will limit the list to the given sheet, will be faster 426 too. 427 """ 428 return self._reader.table_names(sheet_name) 429 430 def defined_names(self) -> list[DefinedName]: 431 """The list of defined names (named ranges) in the workbook. 432 433 Returns a list of DefinedName objects with 'name' and 'formula' attributes. 434 The formula is a string representation of the range or expression. 435 436 Will return an empty list if no defined names are found. 437 """ 438 return self._reader.defined_names() 439 440 @typing.overload 441 def load_table( 442 self, 443 name: str, 444 *, 445 header_row: int | None = None, 446 column_names: list[str] | None = None, 447 skip_rows: int | None = None, 448 n_rows: int | None = None, 449 schema_sample_rows: int | None = 1_000, 450 dtype_coercion: Literal["coerce", "strict"] = "coerce", 451 use_columns: list[str] 452 | list[int] 453 | str 454 | Callable[[ColumnInfoNoDtype], bool] 455 | None = None, 456 dtypes: DType | DTypeMap | None = None, 457 eager: Literal[False] = ..., 458 skip_whitespace_tail_rows: bool = False, 459 whitespace_as_null: bool = False, 460 ) -> ExcelTable: ... 461 462 @typing.overload 463 def load_table( 464 self, 465 name: str, 466 *, 467 header_row: int | None = None, 468 column_names: list[str] | None = None, 469 skip_rows: int | None = None, 470 n_rows: int | None = None, 471 schema_sample_rows: int | None = 1_000, 472 dtype_coercion: Literal["coerce", "strict"] = "coerce", 473 use_columns: list[str] 474 | list[int] 475 | str 476 | Callable[[ColumnInfoNoDtype], bool] 477 | None = None, 478 dtypes: DType | DTypeMap | None = None, 479 eager: Literal[True] = ..., 480 skip_whitespace_tail_rows: bool = False, 481 whitespace_as_null: bool = False, 482 ) -> pa.RecordBatch: ... 483 484 def load_table( 485 self, 486 name: str, 487 *, 488 header_row: int | None = None, 489 column_names: list[str] | None = None, 490 skip_rows: int | None = None, 491 n_rows: int | None = None, 492 schema_sample_rows: int | None = 1_000, 493 dtype_coercion: Literal["coerce", "strict"] = "coerce", 494 use_columns: list[str] 495 | list[int] 496 | str 497 | Callable[[ColumnInfoNoDtype], bool] 498 | None = None, 499 dtypes: DType | DTypeMap | None = None, 500 eager: bool = False, 501 skip_whitespace_tail_rows: bool = False, 502 whitespace_as_null: bool = False, 503 ) -> ExcelTable | pa.RecordBatch: 504 """Loads a table by name. 505 506 :param name: The name of the table to load. 507 :param header_row: The index of the row containing the column labels. 508 If `None`, the table's column names will be used. 509 Any rows before the `header_row` will be automatically skipped. 510 :param column_names: Overrides headers found in the document. 511 If `column_names` is used, `header_row` will be ignored. 512 :param n_rows: Specifies how many rows should be loaded. 513 If `None`, all rows are loaded 514 :param skip_rows: Specifies how many rows should be skipped after the `header_row`. 515 Any rows before the `header_row` are automatically skipped. 516 If `header_row` is `None`, it skips the number of rows from the 517 start of the sheet. 518 :param schema_sample_rows: Specifies how many rows should be used to determine 519 the dtype of a column. Cannot be 0. A specific dtype can be 520 enforced for some or all columns through the `dtypes` parameter. 521 If `None`, all rows will be used. 522 :param dtype_coercion: Specifies how type coercion should behave. `coerce` (the default) 523 will try to coerce different dtypes in a column to the same one, 524 whereas `strict` will raise an error in case a column contains 525 several dtypes. Note that this only applies to columns whose dtype 526 is guessed, i.e. not specified via `dtypes`. 527 :param use_columns: Specifies the columns to use. Can either be: 528 - `None` to select all columns 529 - A list of strings and ints, the column names and/or indices 530 (starting at 0) 531 - A string, a comma separated list of Excel column letters and column 532 ranges (e.g. `"A:E"` or `"A,C,E:F"`, which would result in 533 `A,B,C,D,E` and `A,C,E,F`). Also supports open-ended ranges 534 (e.g. `"B:"` to select all columns from B onwards) and from-beginning 535 ranges (e.g. `":C"` to select columns from A to C). These can be 536 combined for "except" patterns (e.g. `":C,E:"` to select everything 537 except column D) 538 - A callable, a function that takes a column and returns a boolean 539 indicating whether the column should be used 540 :param dtypes: An optional dtype (for all columns) 541 or dict of dtypes with keys as column indices or names. 542 :param eager: Specifies whether the table should be loaded eagerly. 543 `False` (default) will load the table lazily using the `PyCapsule` interface, 544 whereas `True` will load it eagerly via `pyarrow`. 545 546 Eager loading requires the `pyarrow` extra to be installed. 547 :param skip_whitespace_tail_rows: Skip rows at the end of the table 548 containing only whitespace and null values. 549 :param whitespace_as_null: Consider cells containing only whitespace as null values. 550 """ 551 if eager: 552 return self._reader.load_table( 553 name=name, 554 header_row=header_row, 555 column_names=column_names, 556 skip_rows=skip_rows, 557 n_rows=n_rows, 558 schema_sample_rows=schema_sample_rows, 559 dtype_coercion=dtype_coercion, 560 use_columns=use_columns, 561 dtypes=dtypes, 562 eager=True, 563 skip_whitespace_tail_rows=skip_whitespace_tail_rows, 564 whitespace_as_null=whitespace_as_null, 565 ) 566 else: 567 return ExcelTable( 568 self._reader.load_table( 569 name=name, 570 header_row=header_row, 571 column_names=column_names, 572 skip_rows=skip_rows, 573 n_rows=n_rows, 574 schema_sample_rows=schema_sample_rows, 575 dtype_coercion=dtype_coercion, 576 use_columns=use_columns, 577 dtypes=dtypes, 578 eager=False, 579 skip_whitespace_tail_rows=skip_whitespace_tail_rows, 580 whitespace_as_null=whitespace_as_null, 581 ) 582 ) 583 584 def load_sheet_eager( 585 self, 586 idx_or_name: int | str, 587 *, 588 header_row: int | None = 0, 589 column_names: list[str] | None = None, 590 skip_rows: int | list[int] | Callable[[int], bool] | None = None, 591 n_rows: int | None = None, 592 schema_sample_rows: int | None = 1_000, 593 dtype_coercion: Literal["coerce", "strict"] = "coerce", 594 use_columns: list[str] | list[int] | str | None = None, 595 dtypes: DType | DTypeMap | None = None, 596 ) -> pa.RecordBatch: 597 """Loads a sheet eagerly by index or name. 598 599 For xlsx files, this will be faster and more memory-efficient, as it will use 600 `worksheet_range_ref` under the hood, which returns borrowed types. 601 602 Refer to `load_sheet` for parameter documentation 603 604 Requires the `pyarrow` extra to be installed. 605 """ 606 return self._reader.load_sheet( 607 idx_or_name=idx_or_name, 608 header_row=header_row, 609 column_names=column_names, 610 skip_rows=skip_rows, 611 n_rows=n_rows, 612 schema_sample_rows=schema_sample_rows, 613 dtype_coercion=dtype_coercion, 614 use_columns=use_columns, 615 dtypes=dtypes, 616 eager=True, 617 ) 618 619 def load_sheet_by_name( 620 self, 621 name: str, 622 *, 623 header_row: int | None = 0, 624 column_names: list[str] | None = None, 625 skip_rows: int | None = None, 626 n_rows: int | None = None, 627 schema_sample_rows: int | None = 1_000, 628 dtype_coercion: Literal["coerce", "strict"] = "coerce", 629 use_columns: list[str] 630 | list[int] 631 | str 632 | Callable[[ColumnInfoNoDtype], bool] 633 | None = None, 634 dtypes: DType | DTypeMap | None = None, 635 ) -> ExcelSheet: 636 """Loads a sheet by name. 637 638 Refer to `load_sheet` for parameter documentation 639 """ 640 return self.load_sheet( 641 name, 642 header_row=header_row, 643 column_names=column_names, 644 skip_rows=skip_rows, 645 n_rows=n_rows, 646 schema_sample_rows=schema_sample_rows, 647 dtype_coercion=dtype_coercion, 648 use_columns=use_columns, 649 dtypes=dtypes, 650 ) 651 652 def load_sheet_by_idx( 653 self, 654 idx: int, 655 *, 656 header_row: int | None = 0, 657 column_names: list[str] | None = None, 658 skip_rows: int | None = None, 659 n_rows: int | None = None, 660 schema_sample_rows: int | None = 1_000, 661 dtype_coercion: Literal["coerce", "strict"] = "coerce", 662 use_columns: list[str] 663 | list[int] 664 | str 665 | Callable[[ColumnInfoNoDtype], bool] 666 | None = None, 667 dtypes: DType | DTypeMap | None = None, 668 ) -> ExcelSheet: 669 """Loads a sheet by index. 670 671 Refer to `load_sheet` for parameter documentation 672 """ 673 return self.load_sheet( 674 idx, 675 header_row=header_row, 676 column_names=column_names, 677 skip_rows=skip_rows, 678 n_rows=n_rows, 679 schema_sample_rows=schema_sample_rows, 680 dtype_coercion=dtype_coercion, 681 use_columns=use_columns, 682 dtypes=dtypes, 683 ) 684 685 def __repr__(self) -> str: 686 return self._reader.__repr__() 687 688 689def read_excel(source: Path | str | bytes) -> ExcelReader: 690 """Opens and loads an excel file. 691 692 :param source: The path to a file or its content as bytes 693 """ 694 if isinstance(source, str | Path): 695 source = expanduser(source) 696 return ExcelReader(_read_excel(source)) 697 698 699__all__ = ( 700 # version 701 "__version__", 702 # main entrypoint 703 "read_excel", 704 # Python types 705 "DType", 706 "DTypeMap", 707 # Excel reader 708 "ExcelReader", 709 # Excel sheet 710 "ExcelSheet", 711 # Excel table 712 "ExcelTable", 713 # Column metadata 714 "DTypeFrom", 715 "ColumnNameFrom", 716 "ColumnInfo", 717 # Defined names 718 "DefinedName", 719 # Parse error information 720 "CellError", 721 "CellErrors", 722 # Exceptions 723 "FastExcelError", 724 "CannotRetrieveCellDataError", 725 "CalamineCellError", 726 "CalamineError", 727 "SheetNotFoundError", 728 "ColumnNotFoundError", 729 "ArrowError", 730 "InvalidParametersError", 731 "UnsupportedColumnTypeCombinationError", 732)
690def read_excel(source: Path | str | bytes) -> ExcelReader: 691 """Opens and loads an excel file. 692 693 :param source: The path to a file or its content as bytes 694 """ 695 if isinstance(source, str | Path): 696 source = expanduser(source) 697 return ExcelReader(_read_excel(source))
Opens and loads an excel file.
Parameters
- source: The path to a file or its content as bytes
278class ExcelReader: 279 """A class representing an open Excel file and allowing to read its sheets""" 280 281 def __init__(self, reader: _ExcelReader) -> None: 282 self._reader = reader 283 284 @property 285 def sheet_names(self) -> list[str]: 286 """The list of sheet names""" 287 return self._reader.sheet_names 288 289 @typing.overload 290 def load_sheet( 291 self, 292 idx_or_name: int | str, 293 *, 294 header_row: int | None = 0, 295 column_names: list[str] | None = None, 296 skip_rows: int | list[int] | Callable[[int], bool] | None = None, 297 n_rows: int | None = None, 298 schema_sample_rows: int | None = 1_000, 299 dtype_coercion: Literal["coerce", "strict"] = "coerce", 300 use_columns: list[str] 301 | list[int] 302 | str 303 | Callable[[ColumnInfoNoDtype], bool] 304 | None = None, 305 dtypes: DType | DTypeMap | None = None, 306 eager: Literal[False] = ..., 307 skip_whitespace_tail_rows: bool = False, 308 whitespace_as_null: bool = False, 309 ) -> ExcelSheet: ... 310 311 @typing.overload 312 def load_sheet( 313 self, 314 idx_or_name: int | str, 315 *, 316 header_row: int | None = 0, 317 column_names: list[str] | None = None, 318 skip_rows: int | list[int] | Callable[[int], bool] | None = None, 319 n_rows: int | None = None, 320 schema_sample_rows: int | None = 1_000, 321 dtype_coercion: Literal["coerce", "strict"] = "coerce", 322 use_columns: list[str] 323 | list[int] 324 | str 325 | Callable[[ColumnInfoNoDtype], bool] 326 | None = None, 327 dtypes: DType | DTypeMap | None = None, 328 eager: Literal[True] = ..., 329 skip_whitespace_tail_rows: bool = False, 330 whitespace_as_null: bool = False, 331 ) -> pa.RecordBatch: ... 332 333 def load_sheet( 334 self, 335 idx_or_name: int | str, 336 *, 337 header_row: int | None = 0, 338 column_names: list[str] | None = None, 339 skip_rows: int | list[int] | Callable[[int], bool] | None = None, 340 n_rows: int | None = None, 341 schema_sample_rows: int | None = 1_000, 342 dtype_coercion: Literal["coerce", "strict"] = "coerce", 343 use_columns: list[str] 344 | list[int] 345 | str 346 | Callable[[ColumnInfoNoDtype], bool] 347 | None = None, 348 dtypes: DType | DTypeMap | None = None, 349 eager: bool = False, 350 skip_whitespace_tail_rows: bool = False, 351 whitespace_as_null: bool = False, 352 ) -> ExcelSheet | pa.RecordBatch: 353 """Loads a sheet by index or name. 354 355 :param idx_or_name: The index (starting at 0) or the name of the sheet to load. 356 :param header_row: The index of the row containing the column labels, default index is 0. 357 If `None`, the sheet does not have any column labels. 358 Any rows before the `header_row` will be automatically skipped. 359 :param column_names: Overrides headers found in the document. 360 If `column_names` is used, `header_row` will be ignored. 361 :param n_rows: Specifies how many rows should be loaded. 362 If `None`, all rows are loaded 363 :param skip_rows: Specifies which rows should be skipped after the `header_row`. 364 Any rows before the `header_row` are automatically skipped. 365 It means row indices are relative to data rows, not the sheet! 366 Can be one of: 367 - `int`: Skip this many rows after the header row 368 - `list[int]`: Skip specific row indices (0-based relative to data rows) 369 - `Callable[[int], bool]`: Function that receives row index (0-based 370 relative to data rows) and returns True to skip the row 371 - `None`: If `header_row` is None, skips empty rows at beginning 372 :param schema_sample_rows: Specifies how many rows should be used to determine 373 the dtype of a column. Cannot be 0. A specific dtype can be 374 enforced for some or all columns through the `dtypes` parameter. 375 If `None`, all rows will be used. 376 :param dtype_coercion: Specifies how type coercion should behave. `coerce` (the default) 377 will try to coerce different dtypes in a column to the same one, 378 whereas `strict` will raise an error in case a column contains 379 several dtypes. Note that this only applies to columns whose dtype 380 is guessed, i.e. not specified via `dtypes`. 381 :param use_columns: Specifies the columns to use. Can either be: 382 - `None` to select all columns 383 - A list of strings and ints, the column names and/or indices 384 (starting at 0) 385 - A string, a comma separated list of Excel column letters and column 386 ranges (e.g. `"A:E"` or `"A,C,E:F"`, which would result in 387 `A,B,C,D,E` and `A,C,E,F`). Also supports open-ended ranges 388 (e.g. `"B:"` to select all columns from B onwards) and from-beginning 389 ranges (e.g. `":C"` to select columns from A to C). These can be 390 combined for "except" patterns (e.g. `":C,E:"` to select everything 391 except column D) 392 - A callable, a function that takes a column and returns a boolean 393 indicating whether the column should be used 394 :param dtypes: An optional dtype (for all columns) 395 or dict of dtypes with keys as column indices or names. 396 :param eager: Specifies whether the sheet should be loaded eagerly. 397 `False` (default) will load the sheet lazily using the `PyCapsule` interface, 398 whereas `True` will load it eagerly via `pyarrow`. 399 400 Eager loading requires the `pyarrow` extra to be installed. 401 :param skip_whitespace_tail_rows: Skip rows at the end of the sheet 402 containing only whitespace and null values. 403 :param whitespace_as_null: Consider cells containing only whitespace as null values. 404 """ 405 sheet_or_rb = self._reader.load_sheet( 406 idx_or_name=idx_or_name, 407 header_row=header_row, 408 column_names=column_names, 409 skip_rows=skip_rows, 410 n_rows=n_rows, 411 schema_sample_rows=schema_sample_rows, 412 dtype_coercion=dtype_coercion, 413 use_columns=use_columns, 414 dtypes=dtypes, 415 eager=eager, 416 skip_whitespace_tail_rows=skip_whitespace_tail_rows, 417 whitespace_as_null=whitespace_as_null, 418 ) 419 return sheet_or_rb if eager else ExcelSheet(sheet_or_rb) 420 421 def table_names(self, sheet_name: str | None = None) -> list[str]: 422 """The list of table names. 423 424 Will return an empty list if no tables are found. 425 426 :param sheet_name: If given, will limit the list to the given sheet, will be faster 427 too. 428 """ 429 return self._reader.table_names(sheet_name) 430 431 def defined_names(self) -> list[DefinedName]: 432 """The list of defined names (named ranges) in the workbook. 433 434 Returns a list of DefinedName objects with 'name' and 'formula' attributes. 435 The formula is a string representation of the range or expression. 436 437 Will return an empty list if no defined names are found. 438 """ 439 return self._reader.defined_names() 440 441 @typing.overload 442 def load_table( 443 self, 444 name: str, 445 *, 446 header_row: int | None = None, 447 column_names: list[str] | None = None, 448 skip_rows: int | None = None, 449 n_rows: int | None = None, 450 schema_sample_rows: int | None = 1_000, 451 dtype_coercion: Literal["coerce", "strict"] = "coerce", 452 use_columns: list[str] 453 | list[int] 454 | str 455 | Callable[[ColumnInfoNoDtype], bool] 456 | None = None, 457 dtypes: DType | DTypeMap | None = None, 458 eager: Literal[False] = ..., 459 skip_whitespace_tail_rows: bool = False, 460 whitespace_as_null: bool = False, 461 ) -> ExcelTable: ... 462 463 @typing.overload 464 def load_table( 465 self, 466 name: str, 467 *, 468 header_row: int | None = None, 469 column_names: list[str] | None = None, 470 skip_rows: int | None = None, 471 n_rows: int | None = None, 472 schema_sample_rows: int | None = 1_000, 473 dtype_coercion: Literal["coerce", "strict"] = "coerce", 474 use_columns: list[str] 475 | list[int] 476 | str 477 | Callable[[ColumnInfoNoDtype], bool] 478 | None = None, 479 dtypes: DType | DTypeMap | None = None, 480 eager: Literal[True] = ..., 481 skip_whitespace_tail_rows: bool = False, 482 whitespace_as_null: bool = False, 483 ) -> pa.RecordBatch: ... 484 485 def load_table( 486 self, 487 name: str, 488 *, 489 header_row: int | None = None, 490 column_names: list[str] | None = None, 491 skip_rows: int | None = None, 492 n_rows: int | None = None, 493 schema_sample_rows: int | None = 1_000, 494 dtype_coercion: Literal["coerce", "strict"] = "coerce", 495 use_columns: list[str] 496 | list[int] 497 | str 498 | Callable[[ColumnInfoNoDtype], bool] 499 | None = None, 500 dtypes: DType | DTypeMap | None = None, 501 eager: bool = False, 502 skip_whitespace_tail_rows: bool = False, 503 whitespace_as_null: bool = False, 504 ) -> ExcelTable | pa.RecordBatch: 505 """Loads a table by name. 506 507 :param name: The name of the table to load. 508 :param header_row: The index of the row containing the column labels. 509 If `None`, the table's column names will be used. 510 Any rows before the `header_row` will be automatically skipped. 511 :param column_names: Overrides headers found in the document. 512 If `column_names` is used, `header_row` will be ignored. 513 :param n_rows: Specifies how many rows should be loaded. 514 If `None`, all rows are loaded 515 :param skip_rows: Specifies how many rows should be skipped after the `header_row`. 516 Any rows before the `header_row` are automatically skipped. 517 If `header_row` is `None`, it skips the number of rows from the 518 start of the sheet. 519 :param schema_sample_rows: Specifies how many rows should be used to determine 520 the dtype of a column. Cannot be 0. A specific dtype can be 521 enforced for some or all columns through the `dtypes` parameter. 522 If `None`, all rows will be used. 523 :param dtype_coercion: Specifies how type coercion should behave. `coerce` (the default) 524 will try to coerce different dtypes in a column to the same one, 525 whereas `strict` will raise an error in case a column contains 526 several dtypes. Note that this only applies to columns whose dtype 527 is guessed, i.e. not specified via `dtypes`. 528 :param use_columns: Specifies the columns to use. Can either be: 529 - `None` to select all columns 530 - A list of strings and ints, the column names and/or indices 531 (starting at 0) 532 - A string, a comma separated list of Excel column letters and column 533 ranges (e.g. `"A:E"` or `"A,C,E:F"`, which would result in 534 `A,B,C,D,E` and `A,C,E,F`). Also supports open-ended ranges 535 (e.g. `"B:"` to select all columns from B onwards) and from-beginning 536 ranges (e.g. `":C"` to select columns from A to C). These can be 537 combined for "except" patterns (e.g. `":C,E:"` to select everything 538 except column D) 539 - A callable, a function that takes a column and returns a boolean 540 indicating whether the column should be used 541 :param dtypes: An optional dtype (for all columns) 542 or dict of dtypes with keys as column indices or names. 543 :param eager: Specifies whether the table should be loaded eagerly. 544 `False` (default) will load the table lazily using the `PyCapsule` interface, 545 whereas `True` will load it eagerly via `pyarrow`. 546 547 Eager loading requires the `pyarrow` extra to be installed. 548 :param skip_whitespace_tail_rows: Skip rows at the end of the table 549 containing only whitespace and null values. 550 :param whitespace_as_null: Consider cells containing only whitespace as null values. 551 """ 552 if eager: 553 return self._reader.load_table( 554 name=name, 555 header_row=header_row, 556 column_names=column_names, 557 skip_rows=skip_rows, 558 n_rows=n_rows, 559 schema_sample_rows=schema_sample_rows, 560 dtype_coercion=dtype_coercion, 561 use_columns=use_columns, 562 dtypes=dtypes, 563 eager=True, 564 skip_whitespace_tail_rows=skip_whitespace_tail_rows, 565 whitespace_as_null=whitespace_as_null, 566 ) 567 else: 568 return ExcelTable( 569 self._reader.load_table( 570 name=name, 571 header_row=header_row, 572 column_names=column_names, 573 skip_rows=skip_rows, 574 n_rows=n_rows, 575 schema_sample_rows=schema_sample_rows, 576 dtype_coercion=dtype_coercion, 577 use_columns=use_columns, 578 dtypes=dtypes, 579 eager=False, 580 skip_whitespace_tail_rows=skip_whitespace_tail_rows, 581 whitespace_as_null=whitespace_as_null, 582 ) 583 ) 584 585 def load_sheet_eager( 586 self, 587 idx_or_name: int | str, 588 *, 589 header_row: int | None = 0, 590 column_names: list[str] | None = None, 591 skip_rows: int | list[int] | Callable[[int], bool] | None = None, 592 n_rows: int | None = None, 593 schema_sample_rows: int | None = 1_000, 594 dtype_coercion: Literal["coerce", "strict"] = "coerce", 595 use_columns: list[str] | list[int] | str | None = None, 596 dtypes: DType | DTypeMap | None = None, 597 ) -> pa.RecordBatch: 598 """Loads a sheet eagerly by index or name. 599 600 For xlsx files, this will be faster and more memory-efficient, as it will use 601 `worksheet_range_ref` under the hood, which returns borrowed types. 602 603 Refer to `load_sheet` for parameter documentation 604 605 Requires the `pyarrow` extra to be installed. 606 """ 607 return self._reader.load_sheet( 608 idx_or_name=idx_or_name, 609 header_row=header_row, 610 column_names=column_names, 611 skip_rows=skip_rows, 612 n_rows=n_rows, 613 schema_sample_rows=schema_sample_rows, 614 dtype_coercion=dtype_coercion, 615 use_columns=use_columns, 616 dtypes=dtypes, 617 eager=True, 618 ) 619 620 def load_sheet_by_name( 621 self, 622 name: str, 623 *, 624 header_row: int | None = 0, 625 column_names: list[str] | None = None, 626 skip_rows: int | None = None, 627 n_rows: int | None = None, 628 schema_sample_rows: int | None = 1_000, 629 dtype_coercion: Literal["coerce", "strict"] = "coerce", 630 use_columns: list[str] 631 | list[int] 632 | str 633 | Callable[[ColumnInfoNoDtype], bool] 634 | None = None, 635 dtypes: DType | DTypeMap | None = None, 636 ) -> ExcelSheet: 637 """Loads a sheet by name. 638 639 Refer to `load_sheet` for parameter documentation 640 """ 641 return self.load_sheet( 642 name, 643 header_row=header_row, 644 column_names=column_names, 645 skip_rows=skip_rows, 646 n_rows=n_rows, 647 schema_sample_rows=schema_sample_rows, 648 dtype_coercion=dtype_coercion, 649 use_columns=use_columns, 650 dtypes=dtypes, 651 ) 652 653 def load_sheet_by_idx( 654 self, 655 idx: int, 656 *, 657 header_row: int | None = 0, 658 column_names: list[str] | None = None, 659 skip_rows: int | None = None, 660 n_rows: int | None = None, 661 schema_sample_rows: int | None = 1_000, 662 dtype_coercion: Literal["coerce", "strict"] = "coerce", 663 use_columns: list[str] 664 | list[int] 665 | str 666 | Callable[[ColumnInfoNoDtype], bool] 667 | None = None, 668 dtypes: DType | DTypeMap | None = None, 669 ) -> ExcelSheet: 670 """Loads a sheet by index. 671 672 Refer to `load_sheet` for parameter documentation 673 """ 674 return self.load_sheet( 675 idx, 676 header_row=header_row, 677 column_names=column_names, 678 skip_rows=skip_rows, 679 n_rows=n_rows, 680 schema_sample_rows=schema_sample_rows, 681 dtype_coercion=dtype_coercion, 682 use_columns=use_columns, 683 dtypes=dtypes, 684 ) 685 686 def __repr__(self) -> str: 687 return self._reader.__repr__()
A class representing an open Excel file and allowing to read its sheets
284 @property 285 def sheet_names(self) -> list[str]: 286 """The list of sheet names""" 287 return self._reader.sheet_names
The list of sheet names
333 def load_sheet( 334 self, 335 idx_or_name: int | str, 336 *, 337 header_row: int | None = 0, 338 column_names: list[str] | None = None, 339 skip_rows: int | list[int] | Callable[[int], bool] | None = None, 340 n_rows: int | None = None, 341 schema_sample_rows: int | None = 1_000, 342 dtype_coercion: Literal["coerce", "strict"] = "coerce", 343 use_columns: list[str] 344 | list[int] 345 | str 346 | Callable[[ColumnInfoNoDtype], bool] 347 | None = None, 348 dtypes: DType | DTypeMap | None = None, 349 eager: bool = False, 350 skip_whitespace_tail_rows: bool = False, 351 whitespace_as_null: bool = False, 352 ) -> ExcelSheet | pa.RecordBatch: 353 """Loads a sheet by index or name. 354 355 :param idx_or_name: The index (starting at 0) or the name of the sheet to load. 356 :param header_row: The index of the row containing the column labels, default index is 0. 357 If `None`, the sheet does not have any column labels. 358 Any rows before the `header_row` will be automatically skipped. 359 :param column_names: Overrides headers found in the document. 360 If `column_names` is used, `header_row` will be ignored. 361 :param n_rows: Specifies how many rows should be loaded. 362 If `None`, all rows are loaded 363 :param skip_rows: Specifies which rows should be skipped after the `header_row`. 364 Any rows before the `header_row` are automatically skipped. 365 It means row indices are relative to data rows, not the sheet! 366 Can be one of: 367 - `int`: Skip this many rows after the header row 368 - `list[int]`: Skip specific row indices (0-based relative to data rows) 369 - `Callable[[int], bool]`: Function that receives row index (0-based 370 relative to data rows) and returns True to skip the row 371 - `None`: If `header_row` is None, skips empty rows at beginning 372 :param schema_sample_rows: Specifies how many rows should be used to determine 373 the dtype of a column. Cannot be 0. A specific dtype can be 374 enforced for some or all columns through the `dtypes` parameter. 375 If `None`, all rows will be used. 376 :param dtype_coercion: Specifies how type coercion should behave. `coerce` (the default) 377 will try to coerce different dtypes in a column to the same one, 378 whereas `strict` will raise an error in case a column contains 379 several dtypes. Note that this only applies to columns whose dtype 380 is guessed, i.e. not specified via `dtypes`. 381 :param use_columns: Specifies the columns to use. Can either be: 382 - `None` to select all columns 383 - A list of strings and ints, the column names and/or indices 384 (starting at 0) 385 - A string, a comma separated list of Excel column letters and column 386 ranges (e.g. `"A:E"` or `"A,C,E:F"`, which would result in 387 `A,B,C,D,E` and `A,C,E,F`). Also supports open-ended ranges 388 (e.g. `"B:"` to select all columns from B onwards) and from-beginning 389 ranges (e.g. `":C"` to select columns from A to C). These can be 390 combined for "except" patterns (e.g. `":C,E:"` to select everything 391 except column D) 392 - A callable, a function that takes a column and returns a boolean 393 indicating whether the column should be used 394 :param dtypes: An optional dtype (for all columns) 395 or dict of dtypes with keys as column indices or names. 396 :param eager: Specifies whether the sheet should be loaded eagerly. 397 `False` (default) will load the sheet lazily using the `PyCapsule` interface, 398 whereas `True` will load it eagerly via `pyarrow`. 399 400 Eager loading requires the `pyarrow` extra to be installed. 401 :param skip_whitespace_tail_rows: Skip rows at the end of the sheet 402 containing only whitespace and null values. 403 :param whitespace_as_null: Consider cells containing only whitespace as null values. 404 """ 405 sheet_or_rb = self._reader.load_sheet( 406 idx_or_name=idx_or_name, 407 header_row=header_row, 408 column_names=column_names, 409 skip_rows=skip_rows, 410 n_rows=n_rows, 411 schema_sample_rows=schema_sample_rows, 412 dtype_coercion=dtype_coercion, 413 use_columns=use_columns, 414 dtypes=dtypes, 415 eager=eager, 416 skip_whitespace_tail_rows=skip_whitespace_tail_rows, 417 whitespace_as_null=whitespace_as_null, 418 ) 419 return sheet_or_rb if eager else ExcelSheet(sheet_or_rb)
Loads a sheet by index or name.
Parameters
- idx_or_name: The index (starting at 0) or the name of the sheet to load.
- header_row: The index of the row containing the column labels, default index is 0.
If
None, the sheet does not have any column labels. Any rows before theheader_rowwill be automatically skipped. - column_names: Overrides headers found in the document.
If
column_namesis used,header_rowwill be ignored. - n_rows: Specifies how many rows should be loaded.
If
None, all rows are loaded - skip_rows: Specifies which rows should be skipped after the
header_row. Any rows before theheader_roware automatically skipped. It means row indices are relative to data rows, not the sheet! Can be one of:int: Skip this many rows after the header rowlist[int]: Skip specific row indices (0-based relative to data rows)Callable[[int], bool]: Function that receives row index (0-based relative to data rows) and returns True to skip the rowNone: Ifheader_rowis None, skips empty rows at beginning
- schema_sample_rows: Specifies how many rows should be used to determine
the dtype of a column. Cannot be 0. A specific dtype can be
enforced for some or all columns through the
dtypesparameter. IfNone, all rows will be used. - dtype_coercion: Specifies how type coercion should behave.
coerce(the default) will try to coerce different dtypes in a column to the same one, whereasstrictwill raise an error in case a column contains several dtypes. Note that this only applies to columns whose dtype is guessed, i.e. not specified viadtypes. - use_columns: Specifies the columns to use. Can either be:
Noneto select all columns- A list of strings and ints, the column names and/or indices (starting at 0)
- A string, a comma separated list of Excel column letters and column
ranges (e.g.
"A:E"or"A,C,E:F", which would result inA,B,C,D,EandA,C,E,F). Also supports open-ended ranges (e.g."B:"to select all columns from B onwards) and from-beginning ranges (e.g.":C"to select columns from A to C). These can be combined for "except" patterns (e.g.":C,E:"to select everything except column D) - A callable, a function that takes a column and returns a boolean indicating whether the column should be used
- dtypes: An optional dtype (for all columns) or dict of dtypes with keys as column indices or names.
eager: Specifies whether the sheet should be loaded eagerly.
False(default) will load the sheet lazily using thePyCapsuleinterface, whereasTruewill load it eagerly viapyarrow.Eager loading requires the `pyarrow` extra to be installed.- skip_whitespace_tail_rows: Skip rows at the end of the sheet containing only whitespace and null values.
- whitespace_as_null: Consider cells containing only whitespace as null values.
421 def table_names(self, sheet_name: str | None = None) -> list[str]: 422 """The list of table names. 423 424 Will return an empty list if no tables are found. 425 426 :param sheet_name: If given, will limit the list to the given sheet, will be faster 427 too. 428 """ 429 return self._reader.table_names(sheet_name)
The list of table names.
Will return an empty list if no tables are found.
Parameters
- sheet_name: If given, will limit the list to the given sheet, will be faster too.
431 def defined_names(self) -> list[DefinedName]: 432 """The list of defined names (named ranges) in the workbook. 433 434 Returns a list of DefinedName objects with 'name' and 'formula' attributes. 435 The formula is a string representation of the range or expression. 436 437 Will return an empty list if no defined names are found. 438 """ 439 return self._reader.defined_names()
The list of defined names (named ranges) in the workbook.
Returns a list of DefinedName objects with 'name' and 'formula' attributes. The formula is a string representation of the range or expression.
Will return an empty list if no defined names are found.
485 def load_table( 486 self, 487 name: str, 488 *, 489 header_row: int | None = None, 490 column_names: list[str] | None = None, 491 skip_rows: int | None = None, 492 n_rows: int | None = None, 493 schema_sample_rows: int | None = 1_000, 494 dtype_coercion: Literal["coerce", "strict"] = "coerce", 495 use_columns: list[str] 496 | list[int] 497 | str 498 | Callable[[ColumnInfoNoDtype], bool] 499 | None = None, 500 dtypes: DType | DTypeMap | None = None, 501 eager: bool = False, 502 skip_whitespace_tail_rows: bool = False, 503 whitespace_as_null: bool = False, 504 ) -> ExcelTable | pa.RecordBatch: 505 """Loads a table by name. 506 507 :param name: The name of the table to load. 508 :param header_row: The index of the row containing the column labels. 509 If `None`, the table's column names will be used. 510 Any rows before the `header_row` will be automatically skipped. 511 :param column_names: Overrides headers found in the document. 512 If `column_names` is used, `header_row` will be ignored. 513 :param n_rows: Specifies how many rows should be loaded. 514 If `None`, all rows are loaded 515 :param skip_rows: Specifies how many rows should be skipped after the `header_row`. 516 Any rows before the `header_row` are automatically skipped. 517 If `header_row` is `None`, it skips the number of rows from the 518 start of the sheet. 519 :param schema_sample_rows: Specifies how many rows should be used to determine 520 the dtype of a column. Cannot be 0. A specific dtype can be 521 enforced for some or all columns through the `dtypes` parameter. 522 If `None`, all rows will be used. 523 :param dtype_coercion: Specifies how type coercion should behave. `coerce` (the default) 524 will try to coerce different dtypes in a column to the same one, 525 whereas `strict` will raise an error in case a column contains 526 several dtypes. Note that this only applies to columns whose dtype 527 is guessed, i.e. not specified via `dtypes`. 528 :param use_columns: Specifies the columns to use. Can either be: 529 - `None` to select all columns 530 - A list of strings and ints, the column names and/or indices 531 (starting at 0) 532 - A string, a comma separated list of Excel column letters and column 533 ranges (e.g. `"A:E"` or `"A,C,E:F"`, which would result in 534 `A,B,C,D,E` and `A,C,E,F`). Also supports open-ended ranges 535 (e.g. `"B:"` to select all columns from B onwards) and from-beginning 536 ranges (e.g. `":C"` to select columns from A to C). These can be 537 combined for "except" patterns (e.g. `":C,E:"` to select everything 538 except column D) 539 - A callable, a function that takes a column and returns a boolean 540 indicating whether the column should be used 541 :param dtypes: An optional dtype (for all columns) 542 or dict of dtypes with keys as column indices or names. 543 :param eager: Specifies whether the table should be loaded eagerly. 544 `False` (default) will load the table lazily using the `PyCapsule` interface, 545 whereas `True` will load it eagerly via `pyarrow`. 546 547 Eager loading requires the `pyarrow` extra to be installed. 548 :param skip_whitespace_tail_rows: Skip rows at the end of the table 549 containing only whitespace and null values. 550 :param whitespace_as_null: Consider cells containing only whitespace as null values. 551 """ 552 if eager: 553 return self._reader.load_table( 554 name=name, 555 header_row=header_row, 556 column_names=column_names, 557 skip_rows=skip_rows, 558 n_rows=n_rows, 559 schema_sample_rows=schema_sample_rows, 560 dtype_coercion=dtype_coercion, 561 use_columns=use_columns, 562 dtypes=dtypes, 563 eager=True, 564 skip_whitespace_tail_rows=skip_whitespace_tail_rows, 565 whitespace_as_null=whitespace_as_null, 566 ) 567 else: 568 return ExcelTable( 569 self._reader.load_table( 570 name=name, 571 header_row=header_row, 572 column_names=column_names, 573 skip_rows=skip_rows, 574 n_rows=n_rows, 575 schema_sample_rows=schema_sample_rows, 576 dtype_coercion=dtype_coercion, 577 use_columns=use_columns, 578 dtypes=dtypes, 579 eager=False, 580 skip_whitespace_tail_rows=skip_whitespace_tail_rows, 581 whitespace_as_null=whitespace_as_null, 582 ) 583 )
Loads a table by name.
Parameters
- name: The name of the table to load.
- header_row: The index of the row containing the column labels.
If
None, the table's column names will be used. Any rows before theheader_rowwill be automatically skipped. - column_names: Overrides headers found in the document.
If
column_namesis used,header_rowwill be ignored. - n_rows: Specifies how many rows should be loaded.
If
None, all rows are loaded - skip_rows: Specifies how many rows should be skipped after the
header_row. Any rows before theheader_roware automatically skipped. Ifheader_rowisNone, it skips the number of rows from the start of the sheet. - schema_sample_rows: Specifies how many rows should be used to determine
the dtype of a column. Cannot be 0. A specific dtype can be
enforced for some or all columns through the
dtypesparameter. IfNone, all rows will be used. - dtype_coercion: Specifies how type coercion should behave.
coerce(the default) will try to coerce different dtypes in a column to the same one, whereasstrictwill raise an error in case a column contains several dtypes. Note that this only applies to columns whose dtype is guessed, i.e. not specified viadtypes. - use_columns: Specifies the columns to use. Can either be:
Noneto select all columns- A list of strings and ints, the column names and/or indices (starting at 0)
- A string, a comma separated list of Excel column letters and column
ranges (e.g.
"A:E"or"A,C,E:F", which would result inA,B,C,D,EandA,C,E,F). Also supports open-ended ranges (e.g."B:"to select all columns from B onwards) and from-beginning ranges (e.g.":C"to select columns from A to C). These can be combined for "except" patterns (e.g.":C,E:"to select everything except column D) - A callable, a function that takes a column and returns a boolean indicating whether the column should be used
- dtypes: An optional dtype (for all columns) or dict of dtypes with keys as column indices or names.
eager: Specifies whether the table should be loaded eagerly.
False(default) will load the table lazily using thePyCapsuleinterface, whereasTruewill load it eagerly viapyarrow.Eager loading requires the `pyarrow` extra to be installed.- skip_whitespace_tail_rows: Skip rows at the end of the table containing only whitespace and null values.
- whitespace_as_null: Consider cells containing only whitespace as null values.
585 def load_sheet_eager( 586 self, 587 idx_or_name: int | str, 588 *, 589 header_row: int | None = 0, 590 column_names: list[str] | None = None, 591 skip_rows: int | list[int] | Callable[[int], bool] | None = None, 592 n_rows: int | None = None, 593 schema_sample_rows: int | None = 1_000, 594 dtype_coercion: Literal["coerce", "strict"] = "coerce", 595 use_columns: list[str] | list[int] | str | None = None, 596 dtypes: DType | DTypeMap | None = None, 597 ) -> pa.RecordBatch: 598 """Loads a sheet eagerly by index or name. 599 600 For xlsx files, this will be faster and more memory-efficient, as it will use 601 `worksheet_range_ref` under the hood, which returns borrowed types. 602 603 Refer to `load_sheet` for parameter documentation 604 605 Requires the `pyarrow` extra to be installed. 606 """ 607 return self._reader.load_sheet( 608 idx_or_name=idx_or_name, 609 header_row=header_row, 610 column_names=column_names, 611 skip_rows=skip_rows, 612 n_rows=n_rows, 613 schema_sample_rows=schema_sample_rows, 614 dtype_coercion=dtype_coercion, 615 use_columns=use_columns, 616 dtypes=dtypes, 617 eager=True, 618 )
Loads a sheet eagerly by index or name.
For xlsx files, this will be faster and more memory-efficient, as it will use
worksheet_range_ref under the hood, which returns borrowed types.
Refer to load_sheet for parameter documentation
Requires the pyarrow extra to be installed.
620 def load_sheet_by_name( 621 self, 622 name: str, 623 *, 624 header_row: int | None = 0, 625 column_names: list[str] | None = None, 626 skip_rows: int | None = None, 627 n_rows: int | None = None, 628 schema_sample_rows: int | None = 1_000, 629 dtype_coercion: Literal["coerce", "strict"] = "coerce", 630 use_columns: list[str] 631 | list[int] 632 | str 633 | Callable[[ColumnInfoNoDtype], bool] 634 | None = None, 635 dtypes: DType | DTypeMap | None = None, 636 ) -> ExcelSheet: 637 """Loads a sheet by name. 638 639 Refer to `load_sheet` for parameter documentation 640 """ 641 return self.load_sheet( 642 name, 643 header_row=header_row, 644 column_names=column_names, 645 skip_rows=skip_rows, 646 n_rows=n_rows, 647 schema_sample_rows=schema_sample_rows, 648 dtype_coercion=dtype_coercion, 649 use_columns=use_columns, 650 dtypes=dtypes, 651 )
Loads a sheet by name.
Refer to load_sheet for parameter documentation
653 def load_sheet_by_idx( 654 self, 655 idx: int, 656 *, 657 header_row: int | None = 0, 658 column_names: list[str] | None = None, 659 skip_rows: int | None = None, 660 n_rows: int | None = None, 661 schema_sample_rows: int | None = 1_000, 662 dtype_coercion: Literal["coerce", "strict"] = "coerce", 663 use_columns: list[str] 664 | list[int] 665 | str 666 | Callable[[ColumnInfoNoDtype], bool] 667 | None = None, 668 dtypes: DType | DTypeMap | None = None, 669 ) -> ExcelSheet: 670 """Loads a sheet by index. 671 672 Refer to `load_sheet` for parameter documentation 673 """ 674 return self.load_sheet( 675 idx, 676 header_row=header_row, 677 column_names=column_names, 678 skip_rows=skip_rows, 679 n_rows=n_rows, 680 schema_sample_rows=schema_sample_rows, 681 dtype_coercion=dtype_coercion, 682 use_columns=use_columns, 683 dtypes=dtypes, 684 )
Loads a sheet by index.
Refer to load_sheet for parameter documentation
55class ExcelSheet: 56 """A class representing a single sheet in an Excel File""" 57 58 def __init__(self, sheet: _ExcelSheet) -> None: 59 self._sheet = sheet 60 61 @property 62 def name(self) -> str: 63 """The name of the sheet""" 64 return self._sheet.name 65 66 @property 67 def width(self) -> int: 68 """The sheet's width""" 69 return self._sheet.width 70 71 @property 72 def height(self) -> int: 73 """The sheet's height, with `skip_rows` and `nrows` applied""" 74 return self._sheet.height 75 76 @property 77 def total_height(self) -> int: 78 """The sheet's total height""" 79 return self._sheet.total_height 80 81 @property 82 def selected_columns(self) -> list[ColumnInfo]: 83 """The sheet's selected columns""" 84 return self._sheet.selected_columns 85 86 def available_columns(self) -> list[ColumnInfo]: 87 """The columns available for the given sheet""" 88 return self._sheet.available_columns() 89 90 @property 91 def specified_dtypes(self) -> DTypeMap | None: 92 """The dtypes specified for the sheet""" 93 return self._sheet.specified_dtypes 94 95 @property 96 def visible(self) -> SheetVisible: 97 """The visibility of the sheet""" 98 return self._sheet.visible 99 100 def to_arrow(self) -> pa.RecordBatch: 101 """Converts the sheet to a pyarrow `RecordBatch` 102 103 Requires the `pyarrow` extra to be installed. 104 """ 105 if not _PYARROW_AVAILABLE: 106 raise ImportError( 107 "pyarrow is required for to_arrow(). Install with: pip install 'fastexcel[pyarrow]'" 108 ) 109 return self._sheet.to_arrow() 110 111 def to_arrow_with_errors(self) -> tuple[pa.RecordBatch, CellErrors | None]: 112 """Converts the sheet to a pyarrow `RecordBatch` with error information. 113 114 Stores the positions of any values that cannot be parsed as the specified type and were 115 therefore converted to None. 116 117 Requires the `pyarrow` extra to be installed. 118 """ 119 if not _PYARROW_AVAILABLE: 120 raise ImportError( 121 "pyarrow is required for to_arrow_with_errors(). Install with: pip install 'fastexcel[pyarrow]'" # noqa: E501 122 ) 123 rb, cell_errors = self._sheet.to_arrow_with_errors() 124 if not cell_errors.errors: 125 return (rb, None) 126 return (rb, cell_errors) 127 128 def to_pandas(self) -> pd.DataFrame: 129 """Converts the sheet to a Pandas `DataFrame`. 130 131 Requires the `pandas` extra to be installed. 132 """ 133 # Note: pandas PyCapsule interface requires __dataframe__ or __arrow_c_stream__ 134 # which we don't implement. Using pyarrow conversion for now. 135 # (see https://pandas.pydata.org/docs/reference/api/pandas.api.interchange.from_dataframe.html) 136 return self.to_arrow().to_pandas() 137 138 def to_polars(self) -> pl.DataFrame: 139 """Converts the sheet to a Polars `DataFrame`. 140 141 Uses the Arrow PyCapsule Interface for zero-copy data exchange. 142 Requires the `polars` extra to be installed. 143 """ 144 import polars as pl 145 146 return pl.DataFrame(self) 147 148 def __arrow_c_schema__(self) -> object: 149 """Export the schema as an `ArrowSchema` `PyCapsule`. 150 151 https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowschema-export 152 153 The Arrow PyCapsule Interface enables zero-copy data exchange with 154 Arrow-compatible libraries without requiring PyArrow as a dependency. 155 """ 156 return self._sheet.__arrow_c_schema__() 157 158 def __arrow_c_array__(self, requested_schema: object | None = None) -> tuple[object, object]: 159 """Export the schema and data as a pair of `ArrowSchema` and `ArrowArray` `PyCapsules`. 160 161 The optional `requested_schema` parameter allows for potential schema conversion. 162 163 https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowarray-export 164 165 The Arrow PyCapsule Interface enables zero-copy data exchange with 166 Arrow-compatible libraries without requiring PyArrow as a dependency. 167 """ 168 return self._sheet.__arrow_c_array__(requested_schema) 169 170 def __repr__(self) -> str: 171 return self._sheet.__repr__()
A class representing a single sheet in an Excel File
71 @property 72 def height(self) -> int: 73 """The sheet's height, with `skip_rows` and `nrows` applied""" 74 return self._sheet.height
The sheet's height, with skip_rows and nrows applied
76 @property 77 def total_height(self) -> int: 78 """The sheet's total height""" 79 return self._sheet.total_height
The sheet's total height
81 @property 82 def selected_columns(self) -> list[ColumnInfo]: 83 """The sheet's selected columns""" 84 return self._sheet.selected_columns
The sheet's selected columns
86 def available_columns(self) -> list[ColumnInfo]: 87 """The columns available for the given sheet""" 88 return self._sheet.available_columns()
The columns available for the given sheet
90 @property 91 def specified_dtypes(self) -> DTypeMap | None: 92 """The dtypes specified for the sheet""" 93 return self._sheet.specified_dtypes
The dtypes specified for the sheet
95 @property 96 def visible(self) -> SheetVisible: 97 """The visibility of the sheet""" 98 return self._sheet.visible
The visibility of the sheet
100 def to_arrow(self) -> pa.RecordBatch: 101 """Converts the sheet to a pyarrow `RecordBatch` 102 103 Requires the `pyarrow` extra to be installed. 104 """ 105 if not _PYARROW_AVAILABLE: 106 raise ImportError( 107 "pyarrow is required for to_arrow(). Install with: pip install 'fastexcel[pyarrow]'" 108 ) 109 return self._sheet.to_arrow()
Converts the sheet to a pyarrow RecordBatch
Requires the pyarrow extra to be installed.
111 def to_arrow_with_errors(self) -> tuple[pa.RecordBatch, CellErrors | None]: 112 """Converts the sheet to a pyarrow `RecordBatch` with error information. 113 114 Stores the positions of any values that cannot be parsed as the specified type and were 115 therefore converted to None. 116 117 Requires the `pyarrow` extra to be installed. 118 """ 119 if not _PYARROW_AVAILABLE: 120 raise ImportError( 121 "pyarrow is required for to_arrow_with_errors(). Install with: pip install 'fastexcel[pyarrow]'" # noqa: E501 122 ) 123 rb, cell_errors = self._sheet.to_arrow_with_errors() 124 if not cell_errors.errors: 125 return (rb, None) 126 return (rb, cell_errors)
Converts the sheet to a pyarrow RecordBatch with error information.
Stores the positions of any values that cannot be parsed as the specified type and were therefore converted to None.
Requires the pyarrow extra to be installed.
128 def to_pandas(self) -> pd.DataFrame: 129 """Converts the sheet to a Pandas `DataFrame`. 130 131 Requires the `pandas` extra to be installed. 132 """ 133 # Note: pandas PyCapsule interface requires __dataframe__ or __arrow_c_stream__ 134 # which we don't implement. Using pyarrow conversion for now. 135 # (see https://pandas.pydata.org/docs/reference/api/pandas.api.interchange.from_dataframe.html) 136 return self.to_arrow().to_pandas()
Converts the sheet to a Pandas DataFrame.
Requires the pandas extra to be installed.
138 def to_polars(self) -> pl.DataFrame: 139 """Converts the sheet to a Polars `DataFrame`. 140 141 Uses the Arrow PyCapsule Interface for zero-copy data exchange. 142 Requires the `polars` extra to be installed. 143 """ 144 import polars as pl 145 146 return pl.DataFrame(self)
Converts the sheet to a Polars DataFrame.
Uses the Arrow PyCapsule Interface for zero-copy data exchange.
Requires the polars extra to be installed.
174class ExcelTable: 175 """A class representing a single table in an Excel file""" 176 177 def __init__(self, table: _ExcelTable) -> None: 178 self._table = table 179 180 @property 181 def name(self) -> str: 182 """The name of the table""" 183 return self._table.name 184 185 @property 186 def sheet_name(self) -> str: 187 """The name of the sheet this table belongs to""" 188 return self._table.sheet_name 189 190 @property 191 def width(self) -> int: 192 """The table's width""" 193 return self._table.width 194 195 @property 196 def height(self) -> int: 197 """The table's height""" 198 return self._table.height 199 200 @property 201 def total_height(self) -> int: 202 """The table's total height""" 203 return self._table.total_height 204 205 @property 206 def offset(self) -> int: 207 """The table's offset before data starts""" 208 return self._table.offset 209 210 @property 211 def selected_columns(self) -> list[ColumnInfo]: 212 """The table's selected columns""" 213 return self._table.selected_columns 214 215 def available_columns(self) -> list[ColumnInfo]: 216 """The columns available for the given table""" 217 return self._table.available_columns() 218 219 @property 220 def specified_dtypes(self) -> DTypeMap | None: 221 """The dtypes specified for the table""" 222 return self._table.specified_dtypes 223 224 def to_arrow(self) -> pa.RecordBatch: 225 """Converts the table to a pyarrow `RecordBatch` 226 227 Requires the `pyarrow` extra to be installed. 228 """ 229 if not _PYARROW_AVAILABLE: 230 raise ImportError( 231 "pyarrow is required for to_arrow(). Install with: pip install 'fastexcel[pyarrow]'" 232 ) 233 return self._table.to_arrow() 234 235 def to_pandas(self) -> pd.DataFrame: 236 """Converts the table to a Pandas `DataFrame`. 237 238 Requires the `pandas` extra to be installed. 239 """ 240 # Note: pandas PyCapsule interface requires __dataframe__ or __arrow_c_stream__ 241 # which we don't implement. Using pyarrow conversion for now. 242 # (see https://pandas.pydata.org/docs/reference/api/pandas.api.interchange.from_dataframe.html) 243 return self.to_arrow().to_pandas() 244 245 def to_polars(self) -> pl.DataFrame: 246 """Converts the table to a Polars `DataFrame`. 247 248 Uses the Arrow PyCapsule Interface for zero-copy data exchange. 249 Requires the `polars` extra to be installed. 250 """ 251 import polars as pl 252 253 return pl.DataFrame(self) 254 255 def __arrow_c_schema__(self) -> object: 256 """Export the schema as an `ArrowSchema` `PyCapsule`. 257 258 https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowschema-export 259 260 The Arrow PyCapsule Interface enables zero-copy data exchange with 261 Arrow-compatible libraries without requiring PyArrow as a dependency. 262 """ 263 return self._table.__arrow_c_schema__() 264 265 def __arrow_c_array__(self, requested_schema: object | None = None) -> tuple[object, object]: 266 """Export the schema and data as a pair of `ArrowSchema` and `ArrowArray` `PyCapsules`. 267 268 The optional `requested_schema` parameter allows for potential schema conversion. 269 270 https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowarray-export 271 272 The Arrow PyCapsule Interface enables zero-copy data exchange with 273 Arrow-compatible libraries without requiring PyArrow as a dependency. 274 """ 275 return self._table.__arrow_c_array__(requested_schema)
A class representing a single table in an Excel file
180 @property 181 def name(self) -> str: 182 """The name of the table""" 183 return self._table.name
The name of the table
185 @property 186 def sheet_name(self) -> str: 187 """The name of the sheet this table belongs to""" 188 return self._table.sheet_name
The name of the sheet this table belongs to
195 @property 196 def height(self) -> int: 197 """The table's height""" 198 return self._table.height
The table's height
200 @property 201 def total_height(self) -> int: 202 """The table's total height""" 203 return self._table.total_height
The table's total height
205 @property 206 def offset(self) -> int: 207 """The table's offset before data starts""" 208 return self._table.offset
The table's offset before data starts
210 @property 211 def selected_columns(self) -> list[ColumnInfo]: 212 """The table's selected columns""" 213 return self._table.selected_columns
The table's selected columns
215 def available_columns(self) -> list[ColumnInfo]: 216 """The columns available for the given table""" 217 return self._table.available_columns()
The columns available for the given table
219 @property 220 def specified_dtypes(self) -> DTypeMap | None: 221 """The dtypes specified for the table""" 222 return self._table.specified_dtypes
The dtypes specified for the table
224 def to_arrow(self) -> pa.RecordBatch: 225 """Converts the table to a pyarrow `RecordBatch` 226 227 Requires the `pyarrow` extra to be installed. 228 """ 229 if not _PYARROW_AVAILABLE: 230 raise ImportError( 231 "pyarrow is required for to_arrow(). Install with: pip install 'fastexcel[pyarrow]'" 232 ) 233 return self._table.to_arrow()
Converts the table to a pyarrow RecordBatch
Requires the pyarrow extra to be installed.
235 def to_pandas(self) -> pd.DataFrame: 236 """Converts the table to a Pandas `DataFrame`. 237 238 Requires the `pandas` extra to be installed. 239 """ 240 # Note: pandas PyCapsule interface requires __dataframe__ or __arrow_c_stream__ 241 # which we don't implement. Using pyarrow conversion for now. 242 # (see https://pandas.pydata.org/docs/reference/api/pandas.api.interchange.from_dataframe.html) 243 return self.to_arrow().to_pandas()
Converts the table to a Pandas DataFrame.
Requires the pandas extra to be installed.
245 def to_polars(self) -> pl.DataFrame: 246 """Converts the table to a Polars `DataFrame`. 247 248 Uses the Arrow PyCapsule Interface for zero-copy data exchange. 249 Requires the `polars` extra to be installed. 250 """ 251 import polars as pl 252 253 return pl.DataFrame(self)
Converts the table to a Polars DataFrame.
Uses the Arrow PyCapsule Interface for zero-copy data exchange.
Requires the polars extra to be installed.
Metadata about a single column in a sheet.
fastexcel.DTypeFrom. How the dtype of the column was determined.
One of three possible values:
"provided_by_index": The dtype was specified via the column index"provided_by_name": The dtype was specified via the column name"guessed": The dtype was determined from the content of the column
fastexcel.ColumnNameFrom. How the name of the column was determined.
One of three possible values:
"provided": The column name was provided via theuse_columnsparameter"looked_up": The column name was looked up from the data found in the sheet"generated": The column name was generated from the column index, either becauseheader_rowwasNone, or because it could not be looked up
The base class for all fastexcel errors
Data for a given cell cannot be retrieved
calamine returned an error regarding the content of the cell
Generic calamine error
Sheet was not found
Column was not found
Generic arrow error
Provided parameters are invalid
Column contains an unsupported type combination