fastexcel
1from __future__ import annotations 2 3import sys 4import typing 5from typing import TYPE_CHECKING, Callable, Literal 6 7if sys.version_info < (3, 10): 8 from typing_extensions import TypeAlias 9else: 10 from typing import TypeAlias 11 12if TYPE_CHECKING: 13 import pandas as pd 14 import polars as pl 15 import pyarrow as pa 16 17from os.path import expanduser 18from pathlib import Path 19 20try: 21 import importlib.util 22 23 importlib.util.find_spec("pyarrow") 24 _PYARROW_AVAILABLE = True 25except ImportError: 26 _PYARROW_AVAILABLE = False 27 28from ._fastexcel import ( 29 ArrowError, 30 CalamineCellError, 31 CalamineError, 32 CannotRetrieveCellDataError, 33 CellError, 34 CellErrors, 35 ColumnInfo, 36 ColumnInfoNoDtype, 37 ColumnNotFoundError, 38 FastExcelError, 39 InvalidParametersError, 40 SheetNotFoundError, 41 UnsupportedColumnTypeCombinationError, 42 __version__, 43 _ExcelReader, 44 _ExcelSheet, 45 _ExcelTable, 46) 47from ._fastexcel import read_excel as _read_excel 48 49DType = Literal["null", "int", "float", "string", "boolean", "datetime", "date", "duration"] 50DTypeMap: TypeAlias = "dict[str | int, DType]" 51ColumnNameFrom: TypeAlias = Literal["provided", "looked_up", "generated"] 52DTypeFrom: TypeAlias = Literal[ 53 "provided_for_all", "provided_by_index", "provided_by_name", "guessed" 54] 55SheetVisible: TypeAlias = Literal["visible", "hidden", "veryhidden"] 56 57 58class ExcelSheet: 59 """A class representing a single sheet in an Excel File""" 60 61 def __init__(self, sheet: _ExcelSheet) -> None: 62 self._sheet = sheet 63 64 @property 65 def name(self) -> str: 66 """The name of the sheet""" 67 return self._sheet.name 68 69 @property 70 def width(self) -> int: 71 """The sheet's width""" 72 return self._sheet.width 73 74 @property 75 def height(self) -> int: 76 """The sheet's height, with `skip_rows` and `nrows` applied""" 77 return self._sheet.height 78 79 @property 80 def total_height(self) -> int: 81 """The sheet's total height""" 82 return self._sheet.total_height 83 84 @property 85 def selected_columns(self) -> list[ColumnInfo]: 86 """The sheet's selected columns""" 87 return self._sheet.selected_columns 88 89 def available_columns(self) -> list[ColumnInfo]: 90 """The columns available for the given sheet""" 91 return self._sheet.available_columns() 92 93 @property 94 def specified_dtypes(self) -> DTypeMap | None: 95 """The dtypes specified for the sheet""" 96 return self._sheet.specified_dtypes 97 98 @property 99 def visible(self) -> SheetVisible: 100 """The visibility of the sheet""" 101 return self._sheet.visible 102 103 def to_arrow(self) -> "pa.RecordBatch": 104 """Converts the sheet to a pyarrow `RecordBatch`""" 105 if not _PYARROW_AVAILABLE: 106 raise ImportError( 107 "pyarrow is required for to_arrow(). Install with: pip install 'fastexcel[pyarrow]'" 108 ) 109 return self._sheet.to_arrow() 110 111 def to_arrow_with_errors(self) -> "tuple[pa.RecordBatch, CellErrors | None]": 112 """Converts the sheet to a pyarrow `RecordBatch` with error information. 113 114 Stores the positions of any values that cannot be parsed as the specified type and were 115 therefore converted to None. 116 """ 117 if not _PYARROW_AVAILABLE: 118 raise ImportError( 119 "pyarrow is required for to_arrow_with_errors(). Install with: pip install 'fastexcel[pyarrow]'" # noqa: E501 120 ) 121 rb, cell_errors = self._sheet.to_arrow_with_errors() 122 if not cell_errors.errors: 123 return (rb, None) 124 return (rb, cell_errors) 125 126 def to_pandas(self) -> "pd.DataFrame": 127 """Converts the sheet to a Pandas `DataFrame`. 128 129 Requires the `pandas` extra to be installed. 130 """ 131 # Note: pandas PyCapsule interface requires __dataframe__ or __arrow_c_stream__ 132 # which we don't implement. Using pyarrow conversion for now. 133 # (see https://pandas.pydata.org/docs/reference/api/pandas.api.interchange.from_dataframe.html) 134 return self.to_arrow().to_pandas() 135 136 def to_polars(self) -> "pl.DataFrame": 137 """Converts the sheet to a Polars `DataFrame`. 138 139 Uses the Arrow PyCapsule Interface for zero-copy data exchange. 140 Requires the `polars` extra to be installed. 141 """ 142 import polars as pl 143 144 return pl.DataFrame(self) 145 146 def __arrow_c_schema__(self) -> object: 147 """Export the schema as an `ArrowSchema` `PyCapsule`. 148 149 https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowschema-export 150 151 The Arrow PyCapsule Interface enables zero-copy data exchange with 152 Arrow-compatible libraries without requiring PyArrow as a dependency. 153 """ 154 return self._sheet.__arrow_c_schema__() 155 156 def __arrow_c_array__(self, requested_schema: object | None = None) -> tuple[object, object]: 157 """Export the schema and data as a pair of `ArrowSchema` and `ArrowArray` `PyCapsules`. 158 159 The optional `requested_schema` parameter allows for potential schema conversion. 160 161 https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowarray-export 162 163 The Arrow PyCapsule Interface enables zero-copy data exchange with 164 Arrow-compatible libraries without requiring PyArrow as a dependency. 165 """ 166 return self._sheet.__arrow_c_array__(requested_schema) 167 168 def __repr__(self) -> str: 169 return self._sheet.__repr__() 170 171 172class ExcelTable: 173 """A class representing a single table in an Excel file""" 174 175 def __init__(self, table: _ExcelTable) -> None: 176 self._table = table 177 178 @property 179 def name(self) -> str: 180 """The name of the table""" 181 return self._table.name 182 183 @property 184 def sheet_name(self) -> str: 185 """The name of the sheet this table belongs to""" 186 return self._table.sheet_name 187 188 @property 189 def width(self) -> int: 190 """The table's width""" 191 return self._table.width 192 193 @property 194 def height(self) -> int: 195 """The table's height""" 196 return self._table.height 197 198 @property 199 def total_height(self) -> int: 200 """The table's total height""" 201 return self._table.total_height 202 203 @property 204 def offset(self) -> int: 205 """The table's offset before data starts""" 206 return self._table.offset 207 208 @property 209 def selected_columns(self) -> list[ColumnInfo]: 210 """The table's selected columns""" 211 return self._table.selected_columns 212 213 def available_columns(self) -> list[ColumnInfo]: 214 """The columns available for the given table""" 215 return self._table.available_columns() 216 217 @property 218 def specified_dtypes(self) -> DTypeMap | None: 219 """The dtypes specified for the table""" 220 return self._table.specified_dtypes 221 222 def to_arrow(self) -> "pa.RecordBatch": 223 """Converts the table to a pyarrow `RecordBatch`""" 224 if not _PYARROW_AVAILABLE: 225 raise ImportError( 226 "pyarrow is required for to_arrow(). Install with: pip install 'fastexcel[pyarrow]'" 227 ) 228 return self._table.to_arrow() 229 230 def to_pandas(self) -> "pd.DataFrame": 231 """Converts the table to a Pandas `DataFrame`. 232 233 Requires the `pandas` extra to be installed. 234 """ 235 # Note: pandas PyCapsule interface requires __dataframe__ or __arrow_c_stream__ 236 # which we don't implement. Using pyarrow conversion for now. 237 # (see https://pandas.pydata.org/docs/reference/api/pandas.api.interchange.from_dataframe.html) 238 return self.to_arrow().to_pandas() 239 240 def to_polars(self) -> "pl.DataFrame": 241 """Converts the table to a Polars `DataFrame`. 242 243 Uses the Arrow PyCapsule Interface for zero-copy data exchange. 244 Requires the `polars` extra to be installed. 245 """ 246 import polars as pl 247 248 return pl.DataFrame(self) 249 250 def __arrow_c_schema__(self) -> object: 251 """Export the schema as an `ArrowSchema` `PyCapsule`. 252 253 https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowschema-export 254 255 The Arrow PyCapsule Interface enables zero-copy data exchange with 256 Arrow-compatible libraries without requiring PyArrow as a dependency. 257 """ 258 return self._table.__arrow_c_schema__() 259 260 def __arrow_c_array__(self, requested_schema: object | None = None) -> tuple[object, object]: 261 """Export the schema and data as a pair of `ArrowSchema` and `ArrowArray` `PyCapsules`. 262 263 The optional `requested_schema` parameter allows for potential schema conversion. 264 265 https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowarray-export 266 267 The Arrow PyCapsule Interface enables zero-copy data exchange with 268 Arrow-compatible libraries without requiring PyArrow as a dependency. 269 """ 270 return self._table.__arrow_c_array__(requested_schema) 271 272 273class ExcelReader: 274 """A class representing an open Excel file and allowing to read its sheets""" 275 276 def __init__(self, reader: _ExcelReader) -> None: 277 self._reader = reader 278 279 @property 280 def sheet_names(self) -> list[str]: 281 """The list of sheet names""" 282 return self._reader.sheet_names 283 284 def load_sheet( 285 self, 286 idx_or_name: int | str, 287 *, 288 header_row: int | None = 0, 289 column_names: list[str] | None = None, 290 skip_rows: int | list[int] | Callable[[int], bool] | None = None, 291 n_rows: int | None = None, 292 schema_sample_rows: int | None = 1_000, 293 dtype_coercion: Literal["coerce", "strict"] = "coerce", 294 use_columns: list[str] 295 | list[int] 296 | str 297 | Callable[[ColumnInfoNoDtype], bool] 298 | None = None, 299 dtypes: DType | DTypeMap | None = None, 300 ) -> ExcelSheet: 301 """Loads a sheet lazily by index or name. 302 303 :param idx_or_name: The index (starting at 0) or the name of the sheet to load. 304 :param header_row: The index of the row containing the column labels, default index is 0. 305 If `None`, the sheet does not have any column labels. 306 Any rows before the `header_row` will be automatically skipped. 307 :param column_names: Overrides headers found in the document. 308 If `column_names` is used, `header_row` will be ignored. 309 :param n_rows: Specifies how many rows should be loaded. 310 If `None`, all rows are loaded 311 :param skip_rows: Specifies which rows should be skipped after the `header_row`. 312 Any rows before the `header_row` are automatically skipped. 313 It means row indices are relative to data rows, not the sheet! 314 Can be one of: 315 - `int`: Skip this many rows after the header row 316 - `list[int]`: Skip specific row indices (0-based relative to data rows) 317 - `Callable[[int], bool]`: Function that receives row index (0-based 318 relative to data rows) and returns True to skip the row 319 - `None`: If `header_row` is None, skips empty rows at beginning 320 :param schema_sample_rows: Specifies how many rows should be used to determine 321 the dtype of a column. Cannot be 0. A specific dtype can be 322 enforced for some or all columns through the `dtypes` parameter. 323 If `None`, all rows will be used. 324 :param dtype_coercion: Specifies how type coercion should behave. `coerce` (the default) 325 will try to coerce different dtypes in a column to the same one, 326 whereas `strict` will raise an error in case a column contains 327 several dtypes. Note that this only applies to columns whose dtype 328 is guessed, i.e. not specified via `dtypes`. 329 :param use_columns: Specifies the columns to use. Can either be: 330 - `None` to select all columns 331 - A list of strings and ints, the column names and/or indices 332 (starting at 0) 333 - A string, a comma separated list of Excel column letters and column 334 ranges (e.g. `"A:E"` or `"A,C,E:F"`, which would result in 335 `A,B,C,D,E` and `A,C,E,F`). Also supports open-ended ranges 336 (e.g. `"B:"` to select all columns from B onwards) and from-beginning 337 ranges (e.g. `":C"` to select columns from A to C). These can be 338 combined for "except" patterns (e.g. `":C,E:"` to select everything 339 except column D) 340 - A callable, a function that takes a column and returns a boolean 341 indicating whether the column should be used 342 :param dtypes: An optional dtype (for all columns) 343 or dict of dtypes with keys as column indices or names. 344 """ 345 return ExcelSheet( 346 self._reader.load_sheet( 347 idx_or_name=idx_or_name, 348 header_row=header_row, 349 column_names=column_names, 350 skip_rows=skip_rows, 351 n_rows=n_rows, 352 schema_sample_rows=schema_sample_rows, 353 dtype_coercion=dtype_coercion, 354 use_columns=use_columns, 355 dtypes=dtypes, 356 eager=False, 357 ) 358 ) 359 360 def table_names(self, sheet_name: str | None = None) -> list[str]: 361 """The list of table names. 362 363 Will return an empty list if no tables are found. 364 365 :param sheet_name: If given, will limit the list to the given sheet, will be faster 366 too. 367 """ 368 return self._reader.table_names(sheet_name) 369 370 @typing.overload 371 def load_table( 372 self, 373 name: str, 374 *, 375 header_row: int | None = None, 376 column_names: list[str] | None = None, 377 skip_rows: int | None = None, 378 n_rows: int | None = None, 379 schema_sample_rows: int | None = 1_000, 380 dtype_coercion: Literal["coerce", "strict"] = "coerce", 381 use_columns: list[str] 382 | list[int] 383 | str 384 | Callable[[ColumnInfoNoDtype], bool] 385 | None = None, 386 dtypes: DType | DTypeMap | None = None, 387 eager: Literal[False] = ..., 388 ) -> ExcelTable: ... 389 390 @typing.overload 391 def load_table( 392 self, 393 name: str, 394 *, 395 header_row: int | None = None, 396 column_names: list[str] | None = None, 397 skip_rows: int | None = None, 398 n_rows: int | None = None, 399 schema_sample_rows: int | None = 1_000, 400 dtype_coercion: Literal["coerce", "strict"] = "coerce", 401 use_columns: list[str] 402 | list[int] 403 | str 404 | Callable[[ColumnInfoNoDtype], bool] 405 | None = None, 406 dtypes: DType | DTypeMap | None = None, 407 eager: Literal[True] = ..., 408 ) -> "pa.RecordBatch": ... 409 410 def load_table( 411 self, 412 name: str, 413 *, 414 header_row: int | None = None, 415 column_names: list[str] | None = None, 416 skip_rows: int | None = None, 417 n_rows: int | None = None, 418 schema_sample_rows: int | None = 1_000, 419 dtype_coercion: Literal["coerce", "strict"] = "coerce", 420 use_columns: list[str] 421 | list[int] 422 | str 423 | Callable[[ColumnInfoNoDtype], bool] 424 | None = None, 425 dtypes: DType | DTypeMap | None = None, 426 eager: bool = False, 427 ) -> "ExcelTable | pa.RecordBatch": 428 """Loads a table by name. 429 430 :param name: The name of the table to load. 431 :param header_row: The index of the row containing the column labels. 432 If `None`, the table's column names will be used. 433 Any rows before the `header_row` will be automatically skipped. 434 :param column_names: Overrides headers found in the document. 435 If `column_names` is used, `header_row` will be ignored. 436 :param n_rows: Specifies how many rows should be loaded. 437 If `None`, all rows are loaded 438 :param skip_rows: Specifies how many rows should be skipped after the `header_row`. 439 Any rows before the `header_row` are automatically skipped. 440 If `header_row` is `None`, it skips the number of rows from the 441 start of the sheet. 442 :param schema_sample_rows: Specifies how many rows should be used to determine 443 the dtype of a column. Cannot be 0. A specific dtype can be 444 enforced for some or all columns through the `dtypes` parameter. 445 If `None`, all rows will be used. 446 :param dtype_coercion: Specifies how type coercion should behave. `coerce` (the default) 447 will try to coerce different dtypes in a column to the same one, 448 whereas `strict` will raise an error in case a column contains 449 several dtypes. Note that this only applies to columns whose dtype 450 is guessed, i.e. not specified via `dtypes`. 451 :param use_columns: Specifies the columns to use. Can either be: 452 - `None` to select all columns 453 - A list of strings and ints, the column names and/or indices 454 (starting at 0) 455 - A string, a comma separated list of Excel column letters and column 456 ranges (e.g. `"A:E"` or `"A,C,E:F"`, which would result in 457 `A,B,C,D,E` and `A,C,E,F`). Also supports open-ended ranges 458 (e.g. `"B:"` to select all columns from B onwards) and from-beginning 459 ranges (e.g. `":C"` to select columns from A to C). These can be 460 combined for "except" patterns (e.g. `":C,E:"` to select everything 461 except column D) 462 - A callable, a function that takes a column and returns a boolean 463 indicating whether the column should be used 464 :param dtypes: An optional dtype (for all columns) 465 or dict of dtypes with keys as column indices or names. 466 """ 467 if eager: 468 return self._reader.load_table( 469 name=name, 470 header_row=header_row, 471 column_names=column_names, 472 skip_rows=skip_rows, 473 n_rows=n_rows, 474 schema_sample_rows=schema_sample_rows, 475 dtype_coercion=dtype_coercion, 476 use_columns=use_columns, 477 dtypes=dtypes, 478 eager=True, 479 ) 480 else: 481 return ExcelTable( 482 self._reader.load_table( 483 name=name, 484 header_row=header_row, 485 column_names=column_names, 486 skip_rows=skip_rows, 487 n_rows=n_rows, 488 schema_sample_rows=schema_sample_rows, 489 dtype_coercion=dtype_coercion, 490 use_columns=use_columns, 491 dtypes=dtypes, 492 eager=False, 493 ) 494 ) 495 496 def load_sheet_eager( 497 self, 498 idx_or_name: int | str, 499 *, 500 header_row: int | None = 0, 501 column_names: list[str] | None = None, 502 skip_rows: int | list[int] | Callable[[int], bool] | None = None, 503 n_rows: int | None = None, 504 schema_sample_rows: int | None = 1_000, 505 dtype_coercion: Literal["coerce", "strict"] = "coerce", 506 use_columns: list[str] | list[int] | str | None = None, 507 dtypes: DType | DTypeMap | None = None, 508 ) -> "pa.RecordBatch": 509 """Loads a sheet eagerly by index or name. 510 511 For xlsx files, this will be faster and more memory-efficient, as it will use 512 `worksheet_range_ref` under the hood, which returns borrowed types. 513 514 Refer to `load_sheet` for parameter documentation 515 """ 516 return self._reader.load_sheet( 517 idx_or_name=idx_or_name, 518 header_row=header_row, 519 column_names=column_names, 520 skip_rows=skip_rows, 521 n_rows=n_rows, 522 schema_sample_rows=schema_sample_rows, 523 dtype_coercion=dtype_coercion, 524 use_columns=use_columns, 525 dtypes=dtypes, 526 eager=True, 527 ) 528 529 def load_sheet_by_name( 530 self, 531 name: str, 532 *, 533 header_row: int | None = 0, 534 column_names: list[str] | None = None, 535 skip_rows: int | None = None, 536 n_rows: int | None = None, 537 schema_sample_rows: int | None = 1_000, 538 dtype_coercion: Literal["coerce", "strict"] = "coerce", 539 use_columns: list[str] 540 | list[int] 541 | str 542 | Callable[[ColumnInfoNoDtype], bool] 543 | None = None, 544 dtypes: DType | DTypeMap | None = None, 545 ) -> ExcelSheet: 546 """Loads a sheet by name. 547 548 Refer to `load_sheet` for parameter documentation 549 """ 550 return self.load_sheet( 551 name, 552 header_row=header_row, 553 column_names=column_names, 554 skip_rows=skip_rows, 555 n_rows=n_rows, 556 schema_sample_rows=schema_sample_rows, 557 dtype_coercion=dtype_coercion, 558 use_columns=use_columns, 559 dtypes=dtypes, 560 ) 561 562 def load_sheet_by_idx( 563 self, 564 idx: int, 565 *, 566 header_row: int | None = 0, 567 column_names: list[str] | None = None, 568 skip_rows: int | None = None, 569 n_rows: int | None = None, 570 schema_sample_rows: int | None = 1_000, 571 dtype_coercion: Literal["coerce", "strict"] = "coerce", 572 use_columns: list[str] 573 | list[int] 574 | str 575 | Callable[[ColumnInfoNoDtype], bool] 576 | None = None, 577 dtypes: DType | DTypeMap | None = None, 578 ) -> ExcelSheet: 579 """Loads a sheet by index. 580 581 Refer to `load_sheet` for parameter documentation 582 """ 583 return self.load_sheet( 584 idx, 585 header_row=header_row, 586 column_names=column_names, 587 skip_rows=skip_rows, 588 n_rows=n_rows, 589 schema_sample_rows=schema_sample_rows, 590 dtype_coercion=dtype_coercion, 591 use_columns=use_columns, 592 dtypes=dtypes, 593 ) 594 595 def __repr__(self) -> str: 596 return self._reader.__repr__() 597 598 599def read_excel(source: Path | str | bytes) -> ExcelReader: 600 """Opens and loads an excel file. 601 602 :param source: The path to a file or its content as bytes 603 """ 604 if isinstance(source, (str, Path)): 605 source = expanduser(source) 606 return ExcelReader(_read_excel(source)) 607 608 609__all__ = ( 610 # version 611 "__version__", 612 # main entrypoint 613 "read_excel", 614 # Python types 615 "DType", 616 "DTypeMap", 617 # Excel reader 618 "ExcelReader", 619 # Excel sheet 620 "ExcelSheet", 621 # Column metadata 622 "DTypeFrom", 623 "ColumnNameFrom", 624 "ColumnInfo", 625 # Parse error information 626 "CellError", 627 "CellErrors", 628 # Exceptions 629 "FastExcelError", 630 "CannotRetrieveCellDataError", 631 "CalamineCellError", 632 "CalamineError", 633 "SheetNotFoundError", 634 "ColumnNotFoundError", 635 "ArrowError", 636 "InvalidParametersError", 637 "UnsupportedColumnTypeCombinationError", 638)
600def read_excel(source: Path | str | bytes) -> ExcelReader: 601 """Opens and loads an excel file. 602 603 :param source: The path to a file or its content as bytes 604 """ 605 if isinstance(source, (str, Path)): 606 source = expanduser(source) 607 return ExcelReader(_read_excel(source))
Opens and loads an excel file.
Parameters
- source: The path to a file or its content as bytes
274class ExcelReader: 275 """A class representing an open Excel file and allowing to read its sheets""" 276 277 def __init__(self, reader: _ExcelReader) -> None: 278 self._reader = reader 279 280 @property 281 def sheet_names(self) -> list[str]: 282 """The list of sheet names""" 283 return self._reader.sheet_names 284 285 def load_sheet( 286 self, 287 idx_or_name: int | str, 288 *, 289 header_row: int | None = 0, 290 column_names: list[str] | None = None, 291 skip_rows: int | list[int] | Callable[[int], bool] | None = None, 292 n_rows: int | None = None, 293 schema_sample_rows: int | None = 1_000, 294 dtype_coercion: Literal["coerce", "strict"] = "coerce", 295 use_columns: list[str] 296 | list[int] 297 | str 298 | Callable[[ColumnInfoNoDtype], bool] 299 | None = None, 300 dtypes: DType | DTypeMap | None = None, 301 ) -> ExcelSheet: 302 """Loads a sheet lazily by index or name. 303 304 :param idx_or_name: The index (starting at 0) or the name of the sheet to load. 305 :param header_row: The index of the row containing the column labels, default index is 0. 306 If `None`, the sheet does not have any column labels. 307 Any rows before the `header_row` will be automatically skipped. 308 :param column_names: Overrides headers found in the document. 309 If `column_names` is used, `header_row` will be ignored. 310 :param n_rows: Specifies how many rows should be loaded. 311 If `None`, all rows are loaded 312 :param skip_rows: Specifies which rows should be skipped after the `header_row`. 313 Any rows before the `header_row` are automatically skipped. 314 It means row indices are relative to data rows, not the sheet! 315 Can be one of: 316 - `int`: Skip this many rows after the header row 317 - `list[int]`: Skip specific row indices (0-based relative to data rows) 318 - `Callable[[int], bool]`: Function that receives row index (0-based 319 relative to data rows) and returns True to skip the row 320 - `None`: If `header_row` is None, skips empty rows at beginning 321 :param schema_sample_rows: Specifies how many rows should be used to determine 322 the dtype of a column. Cannot be 0. A specific dtype can be 323 enforced for some or all columns through the `dtypes` parameter. 324 If `None`, all rows will be used. 325 :param dtype_coercion: Specifies how type coercion should behave. `coerce` (the default) 326 will try to coerce different dtypes in a column to the same one, 327 whereas `strict` will raise an error in case a column contains 328 several dtypes. Note that this only applies to columns whose dtype 329 is guessed, i.e. not specified via `dtypes`. 330 :param use_columns: Specifies the columns to use. Can either be: 331 - `None` to select all columns 332 - A list of strings and ints, the column names and/or indices 333 (starting at 0) 334 - A string, a comma separated list of Excel column letters and column 335 ranges (e.g. `"A:E"` or `"A,C,E:F"`, which would result in 336 `A,B,C,D,E` and `A,C,E,F`). Also supports open-ended ranges 337 (e.g. `"B:"` to select all columns from B onwards) and from-beginning 338 ranges (e.g. `":C"` to select columns from A to C). These can be 339 combined for "except" patterns (e.g. `":C,E:"` to select everything 340 except column D) 341 - A callable, a function that takes a column and returns a boolean 342 indicating whether the column should be used 343 :param dtypes: An optional dtype (for all columns) 344 or dict of dtypes with keys as column indices or names. 345 """ 346 return ExcelSheet( 347 self._reader.load_sheet( 348 idx_or_name=idx_or_name, 349 header_row=header_row, 350 column_names=column_names, 351 skip_rows=skip_rows, 352 n_rows=n_rows, 353 schema_sample_rows=schema_sample_rows, 354 dtype_coercion=dtype_coercion, 355 use_columns=use_columns, 356 dtypes=dtypes, 357 eager=False, 358 ) 359 ) 360 361 def table_names(self, sheet_name: str | None = None) -> list[str]: 362 """The list of table names. 363 364 Will return an empty list if no tables are found. 365 366 :param sheet_name: If given, will limit the list to the given sheet, will be faster 367 too. 368 """ 369 return self._reader.table_names(sheet_name) 370 371 @typing.overload 372 def load_table( 373 self, 374 name: str, 375 *, 376 header_row: int | None = None, 377 column_names: list[str] | None = None, 378 skip_rows: int | None = None, 379 n_rows: int | None = None, 380 schema_sample_rows: int | None = 1_000, 381 dtype_coercion: Literal["coerce", "strict"] = "coerce", 382 use_columns: list[str] 383 | list[int] 384 | str 385 | Callable[[ColumnInfoNoDtype], bool] 386 | None = None, 387 dtypes: DType | DTypeMap | None = None, 388 eager: Literal[False] = ..., 389 ) -> ExcelTable: ... 390 391 @typing.overload 392 def load_table( 393 self, 394 name: str, 395 *, 396 header_row: int | None = None, 397 column_names: list[str] | None = None, 398 skip_rows: int | None = None, 399 n_rows: int | None = None, 400 schema_sample_rows: int | None = 1_000, 401 dtype_coercion: Literal["coerce", "strict"] = "coerce", 402 use_columns: list[str] 403 | list[int] 404 | str 405 | Callable[[ColumnInfoNoDtype], bool] 406 | None = None, 407 dtypes: DType | DTypeMap | None = None, 408 eager: Literal[True] = ..., 409 ) -> "pa.RecordBatch": ... 410 411 def load_table( 412 self, 413 name: str, 414 *, 415 header_row: int | None = None, 416 column_names: list[str] | None = None, 417 skip_rows: int | None = None, 418 n_rows: int | None = None, 419 schema_sample_rows: int | None = 1_000, 420 dtype_coercion: Literal["coerce", "strict"] = "coerce", 421 use_columns: list[str] 422 | list[int] 423 | str 424 | Callable[[ColumnInfoNoDtype], bool] 425 | None = None, 426 dtypes: DType | DTypeMap | None = None, 427 eager: bool = False, 428 ) -> "ExcelTable | pa.RecordBatch": 429 """Loads a table by name. 430 431 :param name: The name of the table to load. 432 :param header_row: The index of the row containing the column labels. 433 If `None`, the table's column names will be used. 434 Any rows before the `header_row` will be automatically skipped. 435 :param column_names: Overrides headers found in the document. 436 If `column_names` is used, `header_row` will be ignored. 437 :param n_rows: Specifies how many rows should be loaded. 438 If `None`, all rows are loaded 439 :param skip_rows: Specifies how many rows should be skipped after the `header_row`. 440 Any rows before the `header_row` are automatically skipped. 441 If `header_row` is `None`, it skips the number of rows from the 442 start of the sheet. 443 :param schema_sample_rows: Specifies how many rows should be used to determine 444 the dtype of a column. Cannot be 0. A specific dtype can be 445 enforced for some or all columns through the `dtypes` parameter. 446 If `None`, all rows will be used. 447 :param dtype_coercion: Specifies how type coercion should behave. `coerce` (the default) 448 will try to coerce different dtypes in a column to the same one, 449 whereas `strict` will raise an error in case a column contains 450 several dtypes. Note that this only applies to columns whose dtype 451 is guessed, i.e. not specified via `dtypes`. 452 :param use_columns: Specifies the columns to use. Can either be: 453 - `None` to select all columns 454 - A list of strings and ints, the column names and/or indices 455 (starting at 0) 456 - A string, a comma separated list of Excel column letters and column 457 ranges (e.g. `"A:E"` or `"A,C,E:F"`, which would result in 458 `A,B,C,D,E` and `A,C,E,F`). Also supports open-ended ranges 459 (e.g. `"B:"` to select all columns from B onwards) and from-beginning 460 ranges (e.g. `":C"` to select columns from A to C). These can be 461 combined for "except" patterns (e.g. `":C,E:"` to select everything 462 except column D) 463 - A callable, a function that takes a column and returns a boolean 464 indicating whether the column should be used 465 :param dtypes: An optional dtype (for all columns) 466 or dict of dtypes with keys as column indices or names. 467 """ 468 if eager: 469 return self._reader.load_table( 470 name=name, 471 header_row=header_row, 472 column_names=column_names, 473 skip_rows=skip_rows, 474 n_rows=n_rows, 475 schema_sample_rows=schema_sample_rows, 476 dtype_coercion=dtype_coercion, 477 use_columns=use_columns, 478 dtypes=dtypes, 479 eager=True, 480 ) 481 else: 482 return ExcelTable( 483 self._reader.load_table( 484 name=name, 485 header_row=header_row, 486 column_names=column_names, 487 skip_rows=skip_rows, 488 n_rows=n_rows, 489 schema_sample_rows=schema_sample_rows, 490 dtype_coercion=dtype_coercion, 491 use_columns=use_columns, 492 dtypes=dtypes, 493 eager=False, 494 ) 495 ) 496 497 def load_sheet_eager( 498 self, 499 idx_or_name: int | str, 500 *, 501 header_row: int | None = 0, 502 column_names: list[str] | None = None, 503 skip_rows: int | list[int] | Callable[[int], bool] | None = None, 504 n_rows: int | None = None, 505 schema_sample_rows: int | None = 1_000, 506 dtype_coercion: Literal["coerce", "strict"] = "coerce", 507 use_columns: list[str] | list[int] | str | None = None, 508 dtypes: DType | DTypeMap | None = None, 509 ) -> "pa.RecordBatch": 510 """Loads a sheet eagerly by index or name. 511 512 For xlsx files, this will be faster and more memory-efficient, as it will use 513 `worksheet_range_ref` under the hood, which returns borrowed types. 514 515 Refer to `load_sheet` for parameter documentation 516 """ 517 return self._reader.load_sheet( 518 idx_or_name=idx_or_name, 519 header_row=header_row, 520 column_names=column_names, 521 skip_rows=skip_rows, 522 n_rows=n_rows, 523 schema_sample_rows=schema_sample_rows, 524 dtype_coercion=dtype_coercion, 525 use_columns=use_columns, 526 dtypes=dtypes, 527 eager=True, 528 ) 529 530 def load_sheet_by_name( 531 self, 532 name: str, 533 *, 534 header_row: int | None = 0, 535 column_names: list[str] | None = None, 536 skip_rows: int | None = None, 537 n_rows: int | None = None, 538 schema_sample_rows: int | None = 1_000, 539 dtype_coercion: Literal["coerce", "strict"] = "coerce", 540 use_columns: list[str] 541 | list[int] 542 | str 543 | Callable[[ColumnInfoNoDtype], bool] 544 | None = None, 545 dtypes: DType | DTypeMap | None = None, 546 ) -> ExcelSheet: 547 """Loads a sheet by name. 548 549 Refer to `load_sheet` for parameter documentation 550 """ 551 return self.load_sheet( 552 name, 553 header_row=header_row, 554 column_names=column_names, 555 skip_rows=skip_rows, 556 n_rows=n_rows, 557 schema_sample_rows=schema_sample_rows, 558 dtype_coercion=dtype_coercion, 559 use_columns=use_columns, 560 dtypes=dtypes, 561 ) 562 563 def load_sheet_by_idx( 564 self, 565 idx: int, 566 *, 567 header_row: int | None = 0, 568 column_names: list[str] | None = None, 569 skip_rows: int | None = None, 570 n_rows: int | None = None, 571 schema_sample_rows: int | None = 1_000, 572 dtype_coercion: Literal["coerce", "strict"] = "coerce", 573 use_columns: list[str] 574 | list[int] 575 | str 576 | Callable[[ColumnInfoNoDtype], bool] 577 | None = None, 578 dtypes: DType | DTypeMap | None = None, 579 ) -> ExcelSheet: 580 """Loads a sheet by index. 581 582 Refer to `load_sheet` for parameter documentation 583 """ 584 return self.load_sheet( 585 idx, 586 header_row=header_row, 587 column_names=column_names, 588 skip_rows=skip_rows, 589 n_rows=n_rows, 590 schema_sample_rows=schema_sample_rows, 591 dtype_coercion=dtype_coercion, 592 use_columns=use_columns, 593 dtypes=dtypes, 594 ) 595 596 def __repr__(self) -> str: 597 return self._reader.__repr__()
A class representing an open Excel file and allowing to read its sheets
280 @property 281 def sheet_names(self) -> list[str]: 282 """The list of sheet names""" 283 return self._reader.sheet_names
The list of sheet names
285 def load_sheet( 286 self, 287 idx_or_name: int | str, 288 *, 289 header_row: int | None = 0, 290 column_names: list[str] | None = None, 291 skip_rows: int | list[int] | Callable[[int], bool] | None = None, 292 n_rows: int | None = None, 293 schema_sample_rows: int | None = 1_000, 294 dtype_coercion: Literal["coerce", "strict"] = "coerce", 295 use_columns: list[str] 296 | list[int] 297 | str 298 | Callable[[ColumnInfoNoDtype], bool] 299 | None = None, 300 dtypes: DType | DTypeMap | None = None, 301 ) -> ExcelSheet: 302 """Loads a sheet lazily by index or name. 303 304 :param idx_or_name: The index (starting at 0) or the name of the sheet to load. 305 :param header_row: The index of the row containing the column labels, default index is 0. 306 If `None`, the sheet does not have any column labels. 307 Any rows before the `header_row` will be automatically skipped. 308 :param column_names: Overrides headers found in the document. 309 If `column_names` is used, `header_row` will be ignored. 310 :param n_rows: Specifies how many rows should be loaded. 311 If `None`, all rows are loaded 312 :param skip_rows: Specifies which rows should be skipped after the `header_row`. 313 Any rows before the `header_row` are automatically skipped. 314 It means row indices are relative to data rows, not the sheet! 315 Can be one of: 316 - `int`: Skip this many rows after the header row 317 - `list[int]`: Skip specific row indices (0-based relative to data rows) 318 - `Callable[[int], bool]`: Function that receives row index (0-based 319 relative to data rows) and returns True to skip the row 320 - `None`: If `header_row` is None, skips empty rows at beginning 321 :param schema_sample_rows: Specifies how many rows should be used to determine 322 the dtype of a column. Cannot be 0. A specific dtype can be 323 enforced for some or all columns through the `dtypes` parameter. 324 If `None`, all rows will be used. 325 :param dtype_coercion: Specifies how type coercion should behave. `coerce` (the default) 326 will try to coerce different dtypes in a column to the same one, 327 whereas `strict` will raise an error in case a column contains 328 several dtypes. Note that this only applies to columns whose dtype 329 is guessed, i.e. not specified via `dtypes`. 330 :param use_columns: Specifies the columns to use. Can either be: 331 - `None` to select all columns 332 - A list of strings and ints, the column names and/or indices 333 (starting at 0) 334 - A string, a comma separated list of Excel column letters and column 335 ranges (e.g. `"A:E"` or `"A,C,E:F"`, which would result in 336 `A,B,C,D,E` and `A,C,E,F`). Also supports open-ended ranges 337 (e.g. `"B:"` to select all columns from B onwards) and from-beginning 338 ranges (e.g. `":C"` to select columns from A to C). These can be 339 combined for "except" patterns (e.g. `":C,E:"` to select everything 340 except column D) 341 - A callable, a function that takes a column and returns a boolean 342 indicating whether the column should be used 343 :param dtypes: An optional dtype (for all columns) 344 or dict of dtypes with keys as column indices or names. 345 """ 346 return ExcelSheet( 347 self._reader.load_sheet( 348 idx_or_name=idx_or_name, 349 header_row=header_row, 350 column_names=column_names, 351 skip_rows=skip_rows, 352 n_rows=n_rows, 353 schema_sample_rows=schema_sample_rows, 354 dtype_coercion=dtype_coercion, 355 use_columns=use_columns, 356 dtypes=dtypes, 357 eager=False, 358 ) 359 )
Loads a sheet lazily by index or name.
Parameters
- idx_or_name: The index (starting at 0) or the name of the sheet to load.
- header_row: The index of the row containing the column labels, default index is 0.
If
None
, the sheet does not have any column labels. Any rows before theheader_row
will be automatically skipped. - column_names: Overrides headers found in the document.
If
column_names
is used,header_row
will be ignored. - n_rows: Specifies how many rows should be loaded.
If
None
, all rows are loaded - skip_rows: Specifies which rows should be skipped after the
header_row
. Any rows before theheader_row
are automatically skipped. It means row indices are relative to data rows, not the sheet! Can be one of:int
: Skip this many rows after the header rowlist[int]
: Skip specific row indices (0-based relative to data rows)Callable[[int], bool]
: Function that receives row index (0-based relative to data rows) and returns True to skip the rowNone
: Ifheader_row
is None, skips empty rows at beginning
- schema_sample_rows: Specifies how many rows should be used to determine
the dtype of a column. Cannot be 0. A specific dtype can be
enforced for some or all columns through the
dtypes
parameter. IfNone
, all rows will be used. - dtype_coercion: Specifies how type coercion should behave.
coerce
(the default) will try to coerce different dtypes in a column to the same one, whereasstrict
will raise an error in case a column contains several dtypes. Note that this only applies to columns whose dtype is guessed, i.e. not specified viadtypes
. - use_columns: Specifies the columns to use. Can either be:
None
to select all columns- A list of strings and ints, the column names and/or indices (starting at 0)
- A string, a comma separated list of Excel column letters and column
ranges (e.g.
"A:E"
or"A,C,E:F"
, which would result inA,B,C,D,E
andA,C,E,F
). Also supports open-ended ranges (e.g."B:"
to select all columns from B onwards) and from-beginning ranges (e.g.":C"
to select columns from A to C). These can be combined for "except" patterns (e.g.":C,E:"
to select everything except column D) - A callable, a function that takes a column and returns a boolean indicating whether the column should be used
- dtypes: An optional dtype (for all columns) or dict of dtypes with keys as column indices or names.
361 def table_names(self, sheet_name: str | None = None) -> list[str]: 362 """The list of table names. 363 364 Will return an empty list if no tables are found. 365 366 :param sheet_name: If given, will limit the list to the given sheet, will be faster 367 too. 368 """ 369 return self._reader.table_names(sheet_name)
The list of table names.
Will return an empty list if no tables are found.
Parameters
- sheet_name: If given, will limit the list to the given sheet, will be faster too.
411 def load_table( 412 self, 413 name: str, 414 *, 415 header_row: int | None = None, 416 column_names: list[str] | None = None, 417 skip_rows: int | None = None, 418 n_rows: int | None = None, 419 schema_sample_rows: int | None = 1_000, 420 dtype_coercion: Literal["coerce", "strict"] = "coerce", 421 use_columns: list[str] 422 | list[int] 423 | str 424 | Callable[[ColumnInfoNoDtype], bool] 425 | None = None, 426 dtypes: DType | DTypeMap | None = None, 427 eager: bool = False, 428 ) -> "ExcelTable | pa.RecordBatch": 429 """Loads a table by name. 430 431 :param name: The name of the table to load. 432 :param header_row: The index of the row containing the column labels. 433 If `None`, the table's column names will be used. 434 Any rows before the `header_row` will be automatically skipped. 435 :param column_names: Overrides headers found in the document. 436 If `column_names` is used, `header_row` will be ignored. 437 :param n_rows: Specifies how many rows should be loaded. 438 If `None`, all rows are loaded 439 :param skip_rows: Specifies how many rows should be skipped after the `header_row`. 440 Any rows before the `header_row` are automatically skipped. 441 If `header_row` is `None`, it skips the number of rows from the 442 start of the sheet. 443 :param schema_sample_rows: Specifies how many rows should be used to determine 444 the dtype of a column. Cannot be 0. A specific dtype can be 445 enforced for some or all columns through the `dtypes` parameter. 446 If `None`, all rows will be used. 447 :param dtype_coercion: Specifies how type coercion should behave. `coerce` (the default) 448 will try to coerce different dtypes in a column to the same one, 449 whereas `strict` will raise an error in case a column contains 450 several dtypes. Note that this only applies to columns whose dtype 451 is guessed, i.e. not specified via `dtypes`. 452 :param use_columns: Specifies the columns to use. Can either be: 453 - `None` to select all columns 454 - A list of strings and ints, the column names and/or indices 455 (starting at 0) 456 - A string, a comma separated list of Excel column letters and column 457 ranges (e.g. `"A:E"` or `"A,C,E:F"`, which would result in 458 `A,B,C,D,E` and `A,C,E,F`). Also supports open-ended ranges 459 (e.g. `"B:"` to select all columns from B onwards) and from-beginning 460 ranges (e.g. `":C"` to select columns from A to C). These can be 461 combined for "except" patterns (e.g. `":C,E:"` to select everything 462 except column D) 463 - A callable, a function that takes a column and returns a boolean 464 indicating whether the column should be used 465 :param dtypes: An optional dtype (for all columns) 466 or dict of dtypes with keys as column indices or names. 467 """ 468 if eager: 469 return self._reader.load_table( 470 name=name, 471 header_row=header_row, 472 column_names=column_names, 473 skip_rows=skip_rows, 474 n_rows=n_rows, 475 schema_sample_rows=schema_sample_rows, 476 dtype_coercion=dtype_coercion, 477 use_columns=use_columns, 478 dtypes=dtypes, 479 eager=True, 480 ) 481 else: 482 return ExcelTable( 483 self._reader.load_table( 484 name=name, 485 header_row=header_row, 486 column_names=column_names, 487 skip_rows=skip_rows, 488 n_rows=n_rows, 489 schema_sample_rows=schema_sample_rows, 490 dtype_coercion=dtype_coercion, 491 use_columns=use_columns, 492 dtypes=dtypes, 493 eager=False, 494 ) 495 )
Loads a table by name.
Parameters
- name: The name of the table to load.
- header_row: The index of the row containing the column labels.
If
None
, the table's column names will be used. Any rows before theheader_row
will be automatically skipped. - column_names: Overrides headers found in the document.
If
column_names
is used,header_row
will be ignored. - n_rows: Specifies how many rows should be loaded.
If
None
, all rows are loaded - skip_rows: Specifies how many rows should be skipped after the
header_row
. Any rows before theheader_row
are automatically skipped. Ifheader_row
isNone
, it skips the number of rows from the start of the sheet. - schema_sample_rows: Specifies how many rows should be used to determine
the dtype of a column. Cannot be 0. A specific dtype can be
enforced for some or all columns through the
dtypes
parameter. IfNone
, all rows will be used. - dtype_coercion: Specifies how type coercion should behave.
coerce
(the default) will try to coerce different dtypes in a column to the same one, whereasstrict
will raise an error in case a column contains several dtypes. Note that this only applies to columns whose dtype is guessed, i.e. not specified viadtypes
. - use_columns: Specifies the columns to use. Can either be:
None
to select all columns- A list of strings and ints, the column names and/or indices (starting at 0)
- A string, a comma separated list of Excel column letters and column
ranges (e.g.
"A:E"
or"A,C,E:F"
, which would result inA,B,C,D,E
andA,C,E,F
). Also supports open-ended ranges (e.g."B:"
to select all columns from B onwards) and from-beginning ranges (e.g.":C"
to select columns from A to C). These can be combined for "except" patterns (e.g.":C,E:"
to select everything except column D) - A callable, a function that takes a column and returns a boolean indicating whether the column should be used
- dtypes: An optional dtype (for all columns) or dict of dtypes with keys as column indices or names.
497 def load_sheet_eager( 498 self, 499 idx_or_name: int | str, 500 *, 501 header_row: int | None = 0, 502 column_names: list[str] | None = None, 503 skip_rows: int | list[int] | Callable[[int], bool] | None = None, 504 n_rows: int | None = None, 505 schema_sample_rows: int | None = 1_000, 506 dtype_coercion: Literal["coerce", "strict"] = "coerce", 507 use_columns: list[str] | list[int] | str | None = None, 508 dtypes: DType | DTypeMap | None = None, 509 ) -> "pa.RecordBatch": 510 """Loads a sheet eagerly by index or name. 511 512 For xlsx files, this will be faster and more memory-efficient, as it will use 513 `worksheet_range_ref` under the hood, which returns borrowed types. 514 515 Refer to `load_sheet` for parameter documentation 516 """ 517 return self._reader.load_sheet( 518 idx_or_name=idx_or_name, 519 header_row=header_row, 520 column_names=column_names, 521 skip_rows=skip_rows, 522 n_rows=n_rows, 523 schema_sample_rows=schema_sample_rows, 524 dtype_coercion=dtype_coercion, 525 use_columns=use_columns, 526 dtypes=dtypes, 527 eager=True, 528 )
Loads a sheet eagerly by index or name.
For xlsx files, this will be faster and more memory-efficient, as it will use
worksheet_range_ref
under the hood, which returns borrowed types.
Refer to load_sheet
for parameter documentation
530 def load_sheet_by_name( 531 self, 532 name: str, 533 *, 534 header_row: int | None = 0, 535 column_names: list[str] | None = None, 536 skip_rows: int | None = None, 537 n_rows: int | None = None, 538 schema_sample_rows: int | None = 1_000, 539 dtype_coercion: Literal["coerce", "strict"] = "coerce", 540 use_columns: list[str] 541 | list[int] 542 | str 543 | Callable[[ColumnInfoNoDtype], bool] 544 | None = None, 545 dtypes: DType | DTypeMap | None = None, 546 ) -> ExcelSheet: 547 """Loads a sheet by name. 548 549 Refer to `load_sheet` for parameter documentation 550 """ 551 return self.load_sheet( 552 name, 553 header_row=header_row, 554 column_names=column_names, 555 skip_rows=skip_rows, 556 n_rows=n_rows, 557 schema_sample_rows=schema_sample_rows, 558 dtype_coercion=dtype_coercion, 559 use_columns=use_columns, 560 dtypes=dtypes, 561 )
Loads a sheet by name.
Refer to load_sheet
for parameter documentation
563 def load_sheet_by_idx( 564 self, 565 idx: int, 566 *, 567 header_row: int | None = 0, 568 column_names: list[str] | None = None, 569 skip_rows: int | None = None, 570 n_rows: int | None = None, 571 schema_sample_rows: int | None = 1_000, 572 dtype_coercion: Literal["coerce", "strict"] = "coerce", 573 use_columns: list[str] 574 | list[int] 575 | str 576 | Callable[[ColumnInfoNoDtype], bool] 577 | None = None, 578 dtypes: DType | DTypeMap | None = None, 579 ) -> ExcelSheet: 580 """Loads a sheet by index. 581 582 Refer to `load_sheet` for parameter documentation 583 """ 584 return self.load_sheet( 585 idx, 586 header_row=header_row, 587 column_names=column_names, 588 skip_rows=skip_rows, 589 n_rows=n_rows, 590 schema_sample_rows=schema_sample_rows, 591 dtype_coercion=dtype_coercion, 592 use_columns=use_columns, 593 dtypes=dtypes, 594 )
Loads a sheet by index.
Refer to load_sheet
for parameter documentation
59class ExcelSheet: 60 """A class representing a single sheet in an Excel File""" 61 62 def __init__(self, sheet: _ExcelSheet) -> None: 63 self._sheet = sheet 64 65 @property 66 def name(self) -> str: 67 """The name of the sheet""" 68 return self._sheet.name 69 70 @property 71 def width(self) -> int: 72 """The sheet's width""" 73 return self._sheet.width 74 75 @property 76 def height(self) -> int: 77 """The sheet's height, with `skip_rows` and `nrows` applied""" 78 return self._sheet.height 79 80 @property 81 def total_height(self) -> int: 82 """The sheet's total height""" 83 return self._sheet.total_height 84 85 @property 86 def selected_columns(self) -> list[ColumnInfo]: 87 """The sheet's selected columns""" 88 return self._sheet.selected_columns 89 90 def available_columns(self) -> list[ColumnInfo]: 91 """The columns available for the given sheet""" 92 return self._sheet.available_columns() 93 94 @property 95 def specified_dtypes(self) -> DTypeMap | None: 96 """The dtypes specified for the sheet""" 97 return self._sheet.specified_dtypes 98 99 @property 100 def visible(self) -> SheetVisible: 101 """The visibility of the sheet""" 102 return self._sheet.visible 103 104 def to_arrow(self) -> "pa.RecordBatch": 105 """Converts the sheet to a pyarrow `RecordBatch`""" 106 if not _PYARROW_AVAILABLE: 107 raise ImportError( 108 "pyarrow is required for to_arrow(). Install with: pip install 'fastexcel[pyarrow]'" 109 ) 110 return self._sheet.to_arrow() 111 112 def to_arrow_with_errors(self) -> "tuple[pa.RecordBatch, CellErrors | None]": 113 """Converts the sheet to a pyarrow `RecordBatch` with error information. 114 115 Stores the positions of any values that cannot be parsed as the specified type and were 116 therefore converted to None. 117 """ 118 if not _PYARROW_AVAILABLE: 119 raise ImportError( 120 "pyarrow is required for to_arrow_with_errors(). Install with: pip install 'fastexcel[pyarrow]'" # noqa: E501 121 ) 122 rb, cell_errors = self._sheet.to_arrow_with_errors() 123 if not cell_errors.errors: 124 return (rb, None) 125 return (rb, cell_errors) 126 127 def to_pandas(self) -> "pd.DataFrame": 128 """Converts the sheet to a Pandas `DataFrame`. 129 130 Requires the `pandas` extra to be installed. 131 """ 132 # Note: pandas PyCapsule interface requires __dataframe__ or __arrow_c_stream__ 133 # which we don't implement. Using pyarrow conversion for now. 134 # (see https://pandas.pydata.org/docs/reference/api/pandas.api.interchange.from_dataframe.html) 135 return self.to_arrow().to_pandas() 136 137 def to_polars(self) -> "pl.DataFrame": 138 """Converts the sheet to a Polars `DataFrame`. 139 140 Uses the Arrow PyCapsule Interface for zero-copy data exchange. 141 Requires the `polars` extra to be installed. 142 """ 143 import polars as pl 144 145 return pl.DataFrame(self) 146 147 def __arrow_c_schema__(self) -> object: 148 """Export the schema as an `ArrowSchema` `PyCapsule`. 149 150 https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowschema-export 151 152 The Arrow PyCapsule Interface enables zero-copy data exchange with 153 Arrow-compatible libraries without requiring PyArrow as a dependency. 154 """ 155 return self._sheet.__arrow_c_schema__() 156 157 def __arrow_c_array__(self, requested_schema: object | None = None) -> tuple[object, object]: 158 """Export the schema and data as a pair of `ArrowSchema` and `ArrowArray` `PyCapsules`. 159 160 The optional `requested_schema` parameter allows for potential schema conversion. 161 162 https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowarray-export 163 164 The Arrow PyCapsule Interface enables zero-copy data exchange with 165 Arrow-compatible libraries without requiring PyArrow as a dependency. 166 """ 167 return self._sheet.__arrow_c_array__(requested_schema) 168 169 def __repr__(self) -> str: 170 return self._sheet.__repr__()
A class representing a single sheet in an Excel File
75 @property 76 def height(self) -> int: 77 """The sheet's height, with `skip_rows` and `nrows` applied""" 78 return self._sheet.height
The sheet's height, with skip_rows
and nrows
applied
80 @property 81 def total_height(self) -> int: 82 """The sheet's total height""" 83 return self._sheet.total_height
The sheet's total height
85 @property 86 def selected_columns(self) -> list[ColumnInfo]: 87 """The sheet's selected columns""" 88 return self._sheet.selected_columns
The sheet's selected columns
90 def available_columns(self) -> list[ColumnInfo]: 91 """The columns available for the given sheet""" 92 return self._sheet.available_columns()
The columns available for the given sheet
94 @property 95 def specified_dtypes(self) -> DTypeMap | None: 96 """The dtypes specified for the sheet""" 97 return self._sheet.specified_dtypes
The dtypes specified for the sheet
99 @property 100 def visible(self) -> SheetVisible: 101 """The visibility of the sheet""" 102 return self._sheet.visible
The visibility of the sheet
104 def to_arrow(self) -> "pa.RecordBatch": 105 """Converts the sheet to a pyarrow `RecordBatch`""" 106 if not _PYARROW_AVAILABLE: 107 raise ImportError( 108 "pyarrow is required for to_arrow(). Install with: pip install 'fastexcel[pyarrow]'" 109 ) 110 return self._sheet.to_arrow()
Converts the sheet to a pyarrow RecordBatch
112 def to_arrow_with_errors(self) -> "tuple[pa.RecordBatch, CellErrors | None]": 113 """Converts the sheet to a pyarrow `RecordBatch` with error information. 114 115 Stores the positions of any values that cannot be parsed as the specified type and were 116 therefore converted to None. 117 """ 118 if not _PYARROW_AVAILABLE: 119 raise ImportError( 120 "pyarrow is required for to_arrow_with_errors(). Install with: pip install 'fastexcel[pyarrow]'" # noqa: E501 121 ) 122 rb, cell_errors = self._sheet.to_arrow_with_errors() 123 if not cell_errors.errors: 124 return (rb, None) 125 return (rb, cell_errors)
Converts the sheet to a pyarrow RecordBatch
with error information.
Stores the positions of any values that cannot be parsed as the specified type and were therefore converted to None.
127 def to_pandas(self) -> "pd.DataFrame": 128 """Converts the sheet to a Pandas `DataFrame`. 129 130 Requires the `pandas` extra to be installed. 131 """ 132 # Note: pandas PyCapsule interface requires __dataframe__ or __arrow_c_stream__ 133 # which we don't implement. Using pyarrow conversion for now. 134 # (see https://pandas.pydata.org/docs/reference/api/pandas.api.interchange.from_dataframe.html) 135 return self.to_arrow().to_pandas()
Converts the sheet to a Pandas DataFrame
.
Requires the pandas
extra to be installed.
137 def to_polars(self) -> "pl.DataFrame": 138 """Converts the sheet to a Polars `DataFrame`. 139 140 Uses the Arrow PyCapsule Interface for zero-copy data exchange. 141 Requires the `polars` extra to be installed. 142 """ 143 import polars as pl 144 145 return pl.DataFrame(self)
Converts the sheet to a Polars DataFrame
.
Uses the Arrow PyCapsule Interface for zero-copy data exchange.
Requires the polars
extra to be installed.
Metadata about a single column in a sheet.
fastexcel.ColumnNameFrom
. How the name of the column was determined.
One of three possible values:
"provided"
: The column name was provided via theuse_columns
parameter"looked_up"
: The column name was looked up from the data found in the sheet"generated"
: The column name was generated from the column index, either becauseheader_row
wasNone
, or because it could not be looked up
fastexcel.DTypeFrom
. How the dtype of the column was determined.
One of three possible values:
"provided_by_index"
: The dtype was specified via the column index"provided_by_name"
: The dtype was specified via the column name"guessed"
: The dtype was determined from the content of the column
The base class for all fastexcel errors
Data for a given cell cannot be retrieved
calamine returned an error regarding the content of the cell
Generic calamine error
Sheet was not found
Column was not found
Generic arrow error
Provided parameters are invalid
Column contains an unsupported type combination