fastexcel
1from __future__ import annotations 2 3import sys 4import typing 5from typing import TYPE_CHECKING, Callable, Literal 6 7if sys.version_info < (3, 10): 8 from typing_extensions import TypeAlias 9else: 10 from typing import TypeAlias 11 12if TYPE_CHECKING: 13 import pandas as pd 14 import polars as pl 15 16from os.path import expanduser 17from pathlib import Path 18 19import pyarrow as pa 20 21from ._fastexcel import ( 22 ArrowError, 23 CalamineCellError, 24 CalamineError, 25 CannotRetrieveCellDataError, 26 CellError, 27 CellErrors, 28 ColumnInfo, 29 ColumnInfoNoDtype, 30 ColumnNotFoundError, 31 FastExcelError, 32 InvalidParametersError, 33 SheetNotFoundError, 34 UnsupportedColumnTypeCombinationError, 35 __version__, 36 _ExcelReader, 37 _ExcelSheet, 38 _ExcelTable, 39) 40from ._fastexcel import read_excel as _read_excel 41 42DType = Literal["null", "int", "float", "string", "boolean", "datetime", "date", "duration"] 43DTypeMap: TypeAlias = "dict[str | int, DType]" 44ColumnNameFrom: TypeAlias = Literal["provided", "looked_up", "generated"] 45DTypeFrom: TypeAlias = Literal["provided_by_index", "provided_by_name", "guessed"] 46SheetVisible: TypeAlias = Literal["visible", "hidden", "veryhidden"] 47 48 49def _recordbatch_to_polars(rb: pa.RecordBatch) -> pl.DataFrame: 50 import polars as pl 51 52 df = pl.from_arrow(data=rb) 53 assert isinstance(df, pl.DataFrame) 54 return df 55 56 57class ExcelSheet: 58 """A class representing a single sheet in an Excel File""" 59 60 def __init__(self, sheet: _ExcelSheet) -> None: 61 self._sheet = sheet 62 63 @property 64 def name(self) -> str: 65 """The name of the sheet""" 66 return self._sheet.name 67 68 @property 69 def width(self) -> int: 70 """The sheet's width""" 71 return self._sheet.width 72 73 @property 74 def height(self) -> int: 75 """The sheet's height, with `skip_rows` and `nrows` applied""" 76 return self._sheet.height 77 78 @property 79 def total_height(self) -> int: 80 """The sheet's total height""" 81 return self._sheet.total_height 82 83 @property 84 def selected_columns(self) -> list[ColumnInfo]: 85 """The sheet's selected columns""" 86 return self._sheet.selected_columns 87 88 def available_columns(self) -> list[ColumnInfo]: 89 """The columns available for the given sheet""" 90 return self._sheet.available_columns() 91 92 @property 93 def specified_dtypes(self) -> DTypeMap | None: 94 """The dtypes specified for the sheet""" 95 return self._sheet.specified_dtypes 96 97 @property 98 def visible(self) -> SheetVisible: 99 """The visibility of the sheet""" 100 return self._sheet.visible 101 102 def to_arrow(self) -> pa.RecordBatch: 103 """Converts the sheet to a pyarrow `RecordBatch`""" 104 return self._sheet.to_arrow() 105 106 def to_arrow_with_errors(self) -> tuple[pa.RecordBatch, CellErrors | None]: 107 """Converts the sheet to a pyarrow `RecordBatch` with error information. 108 109 Stores the positions of any values that cannot be parsed as the specified type and were 110 therefore converted to None. 111 """ 112 rb, cell_errors = self._sheet.to_arrow_with_errors() 113 if not cell_errors.errors: 114 return (rb, None) 115 return (rb, cell_errors) 116 117 def to_pandas(self) -> "pd.DataFrame": 118 """Converts the sheet to a Pandas `DataFrame`. 119 120 Requires the `pandas` extra to be installed. 121 """ 122 # We know for sure that the sheet will yield exactly one RecordBatch 123 return self.to_arrow().to_pandas() 124 125 def to_polars(self) -> "pl.DataFrame": 126 """Converts the sheet to a Polars `DataFrame`. 127 128 Requires the `polars` extra to be installed. 129 """ 130 return _recordbatch_to_polars(self.to_arrow()) 131 132 def __repr__(self) -> str: 133 return self._sheet.__repr__() 134 135 136class ExcelTable: 137 """A class representing a single table in an Excel file""" 138 139 def __init__(self, table: _ExcelTable) -> None: 140 self._table = table 141 142 @property 143 def name(self) -> str: 144 """The name of the table""" 145 return self._table.name 146 147 @property 148 def sheet_name(self) -> str: 149 """The name of the sheet this table belongs to""" 150 return self._table.sheet_name 151 152 @property 153 def width(self) -> int: 154 """The table's width""" 155 return self._table.width 156 157 @property 158 def height(self) -> int: 159 """The table's height""" 160 return self._table.height 161 162 @property 163 def total_height(self) -> int: 164 """The table's total height""" 165 return self._table.total_height 166 167 @property 168 def offset(self) -> int: 169 """The table's offset before data starts""" 170 return self._table.offset 171 172 @property 173 def selected_columns(self) -> list[ColumnInfo]: 174 """The table's selected columns""" 175 return self._table.selected_columns 176 177 def available_columns(self) -> list[ColumnInfo]: 178 """The columns available for the given table""" 179 return self._table.available_columns() 180 181 @property 182 def specified_dtypes(self) -> DTypeMap | None: 183 """The dtypes specified for the table""" 184 return self._table.specified_dtypes 185 186 def to_arrow(self) -> pa.RecordBatch: 187 """Converts the table to a pyarrow `RecordBatch`""" 188 return self._table.to_arrow() 189 190 def to_pandas(self) -> "pd.DataFrame": 191 """Converts the table to a Pandas `DataFrame`. 192 193 Requires the `pandas` extra to be installed. 194 """ 195 # We know for sure that the table will yield exactly one RecordBatch 196 return self.to_arrow().to_pandas() 197 198 def to_polars(self) -> "pl.DataFrame": 199 """Converts the table to a Polars `DataFrame`. 200 201 Requires the `polars` extra to be installed. 202 """ 203 return _recordbatch_to_polars(self.to_arrow()) 204 205 206class ExcelReader: 207 """A class representing an open Excel file and allowing to read its sheets""" 208 209 def __init__(self, reader: _ExcelReader) -> None: 210 self._reader = reader 211 212 @property 213 def sheet_names(self) -> list[str]: 214 """The list of sheet names""" 215 return self._reader.sheet_names 216 217 def load_sheet( 218 self, 219 idx_or_name: int | str, 220 *, 221 header_row: int | None = 0, 222 column_names: list[str] | None = None, 223 skip_rows: int | None = None, 224 n_rows: int | None = None, 225 schema_sample_rows: int | None = 1_000, 226 dtype_coercion: Literal["coerce", "strict"] = "coerce", 227 use_columns: list[str] 228 | list[int] 229 | str 230 | Callable[[ColumnInfoNoDtype], bool] 231 | None = None, 232 dtypes: DType | DTypeMap | None = None, 233 ) -> ExcelSheet: 234 """Loads a sheet lazily by index or name. 235 236 :param idx_or_name: The index (starting at 0) or the name of the sheet to load. 237 :param header_row: The index of the row containing the column labels, default index is 0. 238 If `None`, the sheet does not have any column labels. 239 Any rows before the `header_row` will be automatically skipped. 240 :param column_names: Overrides headers found in the document. 241 If `column_names` is used, `header_row` will be ignored. 242 :param n_rows: Specifies how many rows should be loaded. 243 If `None`, all rows are loaded 244 :param skip_rows: Specifies how many rows should be skipped after the `header_row`. 245 Any rows before the `header_row` are automatically skipped. 246 If `header_row` is `None`: 247 - if `skip_rows` is `None` (default): it skips all empty rows 248 at the beginning of the sheet. 249 - if `skip_rows` is a number, it skips the specified number 250 of rows from the start of the sheet. 251 :param schema_sample_rows: Specifies how many rows should be used to determine 252 the dtype of a column. Cannot be 0. A specific dtype can be 253 enforced for some or all columns through the `dtypes` parameter. 254 If `None`, all rows will be used. 255 :param dtype_coercion: Specifies how type coercion should behave. `coerce` (the default) 256 will try to coerce different dtypes in a column to the same one, 257 whereas `strict` will raise an error in case a column contains 258 several dtypes. Note that this only applies to columns whose dtype 259 is guessed, i.e. not specified via `dtypes`. 260 :param use_columns: Specifies the columns to use. Can either be: 261 - `None` to select all columns 262 - A list of strings and ints, the column names and/or indices 263 (starting at 0) 264 - A string, a comma separated list of Excel column letters and column 265 ranges (e.g. `“A:E”` or `“A,C,E:F”`, which would result in 266 `A,B,C,D,E` and `A,C,E,F`) 267 - A callable, a function that takes a column and returns a boolean 268 indicating whether the column should be used 269 :param dtypes: An optional dtype (for all columns) 270 or dict of dtypes with keys as column indices or names. 271 """ 272 return ExcelSheet( 273 self._reader.load_sheet( 274 idx_or_name=idx_or_name, 275 header_row=header_row, 276 column_names=column_names, 277 skip_rows=skip_rows, 278 n_rows=n_rows, 279 schema_sample_rows=schema_sample_rows, 280 dtype_coercion=dtype_coercion, 281 use_columns=use_columns, 282 dtypes=dtypes, 283 eager=False, 284 ) 285 ) 286 287 def table_names(self, sheet_name: str | None = None) -> list[str]: 288 """The list of table names. 289 290 Will return an empty list if no tables are found. 291 292 :param sheet_name: If given, will limit the list to the given sheet, will be faster 293 too. 294 """ 295 return self._reader.table_names(sheet_name) 296 297 @typing.overload 298 def load_table( 299 self, 300 name: str, 301 *, 302 header_row: int | None = None, 303 column_names: list[str] | None = None, 304 skip_rows: int = 0, 305 n_rows: int | None = None, 306 schema_sample_rows: int | None = 1_000, 307 dtype_coercion: Literal["coerce", "strict"] = "coerce", 308 use_columns: list[str] 309 | list[int] 310 | str 311 | Callable[[ColumnInfoNoDtype], bool] 312 | None = None, 313 dtypes: DType | DTypeMap | None = None, 314 eager: Literal[False] = ..., 315 ) -> ExcelTable: ... 316 @typing.overload 317 def load_table( 318 self, 319 name: str, 320 *, 321 header_row: int | None = None, 322 column_names: list[str] | None = None, 323 skip_rows: int = 0, 324 n_rows: int | None = None, 325 schema_sample_rows: int | None = 1_000, 326 dtype_coercion: Literal["coerce", "strict"] = "coerce", 327 use_columns: list[str] 328 | list[int] 329 | str 330 | Callable[[ColumnInfoNoDtype], bool] 331 | None = None, 332 dtypes: DType | DTypeMap | None = None, 333 eager: Literal[True] = ..., 334 ) -> pa.RecordBatch: ... 335 def load_table( 336 self, 337 name: str, 338 *, 339 header_row: int | None = None, 340 column_names: list[str] | None = None, 341 skip_rows: int = 0, 342 n_rows: int | None = None, 343 schema_sample_rows: int | None = 1_000, 344 dtype_coercion: Literal["coerce", "strict"] = "coerce", 345 use_columns: list[str] 346 | list[int] 347 | str 348 | Callable[[ColumnInfoNoDtype], bool] 349 | None = None, 350 dtypes: DType | DTypeMap | None = None, 351 eager: bool = False, 352 ) -> ExcelTable | pa.RecordBatch: 353 """Loads a table by name. 354 355 :param name: The name of the table to load. 356 :param header_row: The index of the row containing the column labels. 357 If `None`, the table's column names will be used. 358 Any rows before the `header_row` will be automatically skipped. 359 :param column_names: Overrides headers found in the document. 360 If `column_names` is used, `header_row` will be ignored. 361 :param n_rows: Specifies how many rows should be loaded. 362 If `None`, all rows are loaded 363 :param skip_rows: Specifies how many rows should be skipped after the `header_row`. 364 Any rows before the `header_row` are automatically skipped. 365 If `header_row` is `None`, it skips the number of rows from the 366 start of the sheet. 367 :param schema_sample_rows: Specifies how many rows should be used to determine 368 the dtype of a column. Cannot be 0. A specific dtype can be 369 enforced for some or all columns through the `dtypes` parameter. 370 If `None`, all rows will be used. 371 :param dtype_coercion: Specifies how type coercion should behave. `coerce` (the default) 372 will try to coerce different dtypes in a column to the same one, 373 whereas `strict` will raise an error in case a column contains 374 several dtypes. Note that this only applies to columns whose dtype 375 is guessed, i.e. not specified via `dtypes`. 376 :param use_columns: Specifies the columns to use. Can either be: 377 - `None` to select all columns 378 - A list of strings and ints, the column names and/or indices 379 (starting at 0) 380 - A string, a comma separated list of Excel column letters and column 381 ranges (e.g. `“A:E”` or `“A,C,E:F”`, which would result in 382 `A,B,C,D,E` and `A,C,E,F`) 383 - A callable, a function that takes a column and returns a boolean 384 indicating whether the column should be used 385 :param dtypes: An optional dtype (for all columns) 386 or dict of dtypes with keys as column indices or names. 387 """ 388 output = self._reader.load_table( # type:ignore[call-overload,misc] 389 name=name, 390 header_row=header_row, 391 column_names=column_names, 392 skip_rows=skip_rows, 393 n_rows=n_rows, 394 schema_sample_rows=schema_sample_rows, 395 dtype_coercion=dtype_coercion, 396 use_columns=use_columns, 397 dtypes=dtypes, 398 eager=eager, 399 ) 400 if eager: 401 return output 402 return ExcelTable(output) 403 404 def load_sheet_eager( 405 self, 406 idx_or_name: int | str, 407 *, 408 header_row: int | None = 0, 409 column_names: list[str] | None = None, 410 skip_rows: int | None = None, 411 n_rows: int | None = None, 412 schema_sample_rows: int | None = 1_000, 413 dtype_coercion: Literal["coerce", "strict"] = "coerce", 414 use_columns: list[str] | list[int] | str | None = None, 415 dtypes: DType | DTypeMap | None = None, 416 ) -> pa.RecordBatch: 417 """Loads a sheet eagerly by index or name. 418 419 For xlsx files, this will be faster and more memory-efficient, as it will use 420 `worksheet_range_ref` under the hood, which returns borrowed types. 421 422 Refer to `load_sheet` for parameter documentation 423 """ 424 return self._reader.load_sheet( 425 idx_or_name=idx_or_name, 426 header_row=header_row, 427 column_names=column_names, 428 skip_rows=skip_rows, 429 n_rows=n_rows, 430 schema_sample_rows=schema_sample_rows, 431 dtype_coercion=dtype_coercion, 432 use_columns=use_columns, 433 dtypes=dtypes, 434 eager=True, 435 ) 436 437 def load_sheet_by_name( 438 self, 439 name: str, 440 *, 441 header_row: int | None = 0, 442 column_names: list[str] | None = None, 443 skip_rows: int | None = None, 444 n_rows: int | None = None, 445 schema_sample_rows: int | None = 1_000, 446 dtype_coercion: Literal["coerce", "strict"] = "coerce", 447 use_columns: list[str] 448 | list[int] 449 | str 450 | Callable[[ColumnInfoNoDtype], bool] 451 | None = None, 452 dtypes: DType | DTypeMap | None = None, 453 ) -> ExcelSheet: 454 """Loads a sheet by name. 455 456 Refer to `load_sheet` for parameter documentation 457 """ 458 return self.load_sheet( 459 name, 460 header_row=header_row, 461 column_names=column_names, 462 skip_rows=skip_rows, 463 n_rows=n_rows, 464 schema_sample_rows=schema_sample_rows, 465 dtype_coercion=dtype_coercion, 466 use_columns=use_columns, 467 dtypes=dtypes, 468 ) 469 470 def load_sheet_by_idx( 471 self, 472 idx: int, 473 *, 474 header_row: int | None = 0, 475 column_names: list[str] | None = None, 476 skip_rows: int | None = None, 477 n_rows: int | None = None, 478 schema_sample_rows: int | None = 1_000, 479 dtype_coercion: Literal["coerce", "strict"] = "coerce", 480 use_columns: list[str] 481 | list[int] 482 | str 483 | Callable[[ColumnInfoNoDtype], bool] 484 | None = None, 485 dtypes: DType | DTypeMap | None = None, 486 ) -> ExcelSheet: 487 """Loads a sheet by index. 488 489 Refer to `load_sheet` for parameter documentation 490 """ 491 return self.load_sheet( 492 idx, 493 header_row=header_row, 494 column_names=column_names, 495 skip_rows=skip_rows, 496 n_rows=n_rows, 497 schema_sample_rows=schema_sample_rows, 498 dtype_coercion=dtype_coercion, 499 use_columns=use_columns, 500 dtypes=dtypes, 501 ) 502 503 def __repr__(self) -> str: 504 return self._reader.__repr__() 505 506 507def read_excel(source: Path | str | bytes) -> ExcelReader: 508 """Opens and loads an excel file. 509 510 :param source: The path to a file or its content as bytes 511 """ 512 if isinstance(source, (str, Path)): 513 source = expanduser(source) 514 return ExcelReader(_read_excel(source)) 515 516 517__all__ = ( 518 ## version 519 "__version__", 520 ## main entrypoint 521 "read_excel", 522 ## Python types 523 "DType", 524 "DTypeMap", 525 # Excel reader 526 "ExcelReader", 527 # Excel sheet 528 "ExcelSheet", 529 # Column metadata 530 "DTypeFrom", 531 "ColumnNameFrom", 532 "ColumnInfo", 533 # Parse error information 534 "CellError", 535 "CellErrors", 536 # Exceptions 537 "FastExcelError", 538 "CannotRetrieveCellDataError", 539 "CalamineCellError", 540 "CalamineError", 541 "SheetNotFoundError", 542 "ColumnNotFoundError", 543 "ArrowError", 544 "InvalidParametersError", 545 "UnsupportedColumnTypeCombinationError", 546)
508def read_excel(source: Path | str | bytes) -> ExcelReader: 509 """Opens and loads an excel file. 510 511 :param source: The path to a file or its content as bytes 512 """ 513 if isinstance(source, (str, Path)): 514 source = expanduser(source) 515 return ExcelReader(_read_excel(source))
Opens and loads an excel file.
Parameters
- source: The path to a file or its content as bytes
207class ExcelReader: 208 """A class representing an open Excel file and allowing to read its sheets""" 209 210 def __init__(self, reader: _ExcelReader) -> None: 211 self._reader = reader 212 213 @property 214 def sheet_names(self) -> list[str]: 215 """The list of sheet names""" 216 return self._reader.sheet_names 217 218 def load_sheet( 219 self, 220 idx_or_name: int | str, 221 *, 222 header_row: int | None = 0, 223 column_names: list[str] | None = None, 224 skip_rows: int | None = None, 225 n_rows: int | None = None, 226 schema_sample_rows: int | None = 1_000, 227 dtype_coercion: Literal["coerce", "strict"] = "coerce", 228 use_columns: list[str] 229 | list[int] 230 | str 231 | Callable[[ColumnInfoNoDtype], bool] 232 | None = None, 233 dtypes: DType | DTypeMap | None = None, 234 ) -> ExcelSheet: 235 """Loads a sheet lazily by index or name. 236 237 :param idx_or_name: The index (starting at 0) or the name of the sheet to load. 238 :param header_row: The index of the row containing the column labels, default index is 0. 239 If `None`, the sheet does not have any column labels. 240 Any rows before the `header_row` will be automatically skipped. 241 :param column_names: Overrides headers found in the document. 242 If `column_names` is used, `header_row` will be ignored. 243 :param n_rows: Specifies how many rows should be loaded. 244 If `None`, all rows are loaded 245 :param skip_rows: Specifies how many rows should be skipped after the `header_row`. 246 Any rows before the `header_row` are automatically skipped. 247 If `header_row` is `None`: 248 - if `skip_rows` is `None` (default): it skips all empty rows 249 at the beginning of the sheet. 250 - if `skip_rows` is a number, it skips the specified number 251 of rows from the start of the sheet. 252 :param schema_sample_rows: Specifies how many rows should be used to determine 253 the dtype of a column. Cannot be 0. A specific dtype can be 254 enforced for some or all columns through the `dtypes` parameter. 255 If `None`, all rows will be used. 256 :param dtype_coercion: Specifies how type coercion should behave. `coerce` (the default) 257 will try to coerce different dtypes in a column to the same one, 258 whereas `strict` will raise an error in case a column contains 259 several dtypes. Note that this only applies to columns whose dtype 260 is guessed, i.e. not specified via `dtypes`. 261 :param use_columns: Specifies the columns to use. Can either be: 262 - `None` to select all columns 263 - A list of strings and ints, the column names and/or indices 264 (starting at 0) 265 - A string, a comma separated list of Excel column letters and column 266 ranges (e.g. `“A:E”` or `“A,C,E:F”`, which would result in 267 `A,B,C,D,E` and `A,C,E,F`) 268 - A callable, a function that takes a column and returns a boolean 269 indicating whether the column should be used 270 :param dtypes: An optional dtype (for all columns) 271 or dict of dtypes with keys as column indices or names. 272 """ 273 return ExcelSheet( 274 self._reader.load_sheet( 275 idx_or_name=idx_or_name, 276 header_row=header_row, 277 column_names=column_names, 278 skip_rows=skip_rows, 279 n_rows=n_rows, 280 schema_sample_rows=schema_sample_rows, 281 dtype_coercion=dtype_coercion, 282 use_columns=use_columns, 283 dtypes=dtypes, 284 eager=False, 285 ) 286 ) 287 288 def table_names(self, sheet_name: str | None = None) -> list[str]: 289 """The list of table names. 290 291 Will return an empty list if no tables are found. 292 293 :param sheet_name: If given, will limit the list to the given sheet, will be faster 294 too. 295 """ 296 return self._reader.table_names(sheet_name) 297 298 @typing.overload 299 def load_table( 300 self, 301 name: str, 302 *, 303 header_row: int | None = None, 304 column_names: list[str] | None = None, 305 skip_rows: int = 0, 306 n_rows: int | None = None, 307 schema_sample_rows: int | None = 1_000, 308 dtype_coercion: Literal["coerce", "strict"] = "coerce", 309 use_columns: list[str] 310 | list[int] 311 | str 312 | Callable[[ColumnInfoNoDtype], bool] 313 | None = None, 314 dtypes: DType | DTypeMap | None = None, 315 eager: Literal[False] = ..., 316 ) -> ExcelTable: ... 317 @typing.overload 318 def load_table( 319 self, 320 name: str, 321 *, 322 header_row: int | None = None, 323 column_names: list[str] | None = None, 324 skip_rows: int = 0, 325 n_rows: int | None = None, 326 schema_sample_rows: int | None = 1_000, 327 dtype_coercion: Literal["coerce", "strict"] = "coerce", 328 use_columns: list[str] 329 | list[int] 330 | str 331 | Callable[[ColumnInfoNoDtype], bool] 332 | None = None, 333 dtypes: DType | DTypeMap | None = None, 334 eager: Literal[True] = ..., 335 ) -> pa.RecordBatch: ... 336 def load_table( 337 self, 338 name: str, 339 *, 340 header_row: int | None = None, 341 column_names: list[str] | None = None, 342 skip_rows: int = 0, 343 n_rows: int | None = None, 344 schema_sample_rows: int | None = 1_000, 345 dtype_coercion: Literal["coerce", "strict"] = "coerce", 346 use_columns: list[str] 347 | list[int] 348 | str 349 | Callable[[ColumnInfoNoDtype], bool] 350 | None = None, 351 dtypes: DType | DTypeMap | None = None, 352 eager: bool = False, 353 ) -> ExcelTable | pa.RecordBatch: 354 """Loads a table by name. 355 356 :param name: The name of the table to load. 357 :param header_row: The index of the row containing the column labels. 358 If `None`, the table's column names will be used. 359 Any rows before the `header_row` will be automatically skipped. 360 :param column_names: Overrides headers found in the document. 361 If `column_names` is used, `header_row` will be ignored. 362 :param n_rows: Specifies how many rows should be loaded. 363 If `None`, all rows are loaded 364 :param skip_rows: Specifies how many rows should be skipped after the `header_row`. 365 Any rows before the `header_row` are automatically skipped. 366 If `header_row` is `None`, it skips the number of rows from the 367 start of the sheet. 368 :param schema_sample_rows: Specifies how many rows should be used to determine 369 the dtype of a column. Cannot be 0. A specific dtype can be 370 enforced for some or all columns through the `dtypes` parameter. 371 If `None`, all rows will be used. 372 :param dtype_coercion: Specifies how type coercion should behave. `coerce` (the default) 373 will try to coerce different dtypes in a column to the same one, 374 whereas `strict` will raise an error in case a column contains 375 several dtypes. Note that this only applies to columns whose dtype 376 is guessed, i.e. not specified via `dtypes`. 377 :param use_columns: Specifies the columns to use. Can either be: 378 - `None` to select all columns 379 - A list of strings and ints, the column names and/or indices 380 (starting at 0) 381 - A string, a comma separated list of Excel column letters and column 382 ranges (e.g. `“A:E”` or `“A,C,E:F”`, which would result in 383 `A,B,C,D,E` and `A,C,E,F`) 384 - A callable, a function that takes a column and returns a boolean 385 indicating whether the column should be used 386 :param dtypes: An optional dtype (for all columns) 387 or dict of dtypes with keys as column indices or names. 388 """ 389 output = self._reader.load_table( # type:ignore[call-overload,misc] 390 name=name, 391 header_row=header_row, 392 column_names=column_names, 393 skip_rows=skip_rows, 394 n_rows=n_rows, 395 schema_sample_rows=schema_sample_rows, 396 dtype_coercion=dtype_coercion, 397 use_columns=use_columns, 398 dtypes=dtypes, 399 eager=eager, 400 ) 401 if eager: 402 return output 403 return ExcelTable(output) 404 405 def load_sheet_eager( 406 self, 407 idx_or_name: int | str, 408 *, 409 header_row: int | None = 0, 410 column_names: list[str] | None = None, 411 skip_rows: int | None = None, 412 n_rows: int | None = None, 413 schema_sample_rows: int | None = 1_000, 414 dtype_coercion: Literal["coerce", "strict"] = "coerce", 415 use_columns: list[str] | list[int] | str | None = None, 416 dtypes: DType | DTypeMap | None = None, 417 ) -> pa.RecordBatch: 418 """Loads a sheet eagerly by index or name. 419 420 For xlsx files, this will be faster and more memory-efficient, as it will use 421 `worksheet_range_ref` under the hood, which returns borrowed types. 422 423 Refer to `load_sheet` for parameter documentation 424 """ 425 return self._reader.load_sheet( 426 idx_or_name=idx_or_name, 427 header_row=header_row, 428 column_names=column_names, 429 skip_rows=skip_rows, 430 n_rows=n_rows, 431 schema_sample_rows=schema_sample_rows, 432 dtype_coercion=dtype_coercion, 433 use_columns=use_columns, 434 dtypes=dtypes, 435 eager=True, 436 ) 437 438 def load_sheet_by_name( 439 self, 440 name: str, 441 *, 442 header_row: int | None = 0, 443 column_names: list[str] | None = None, 444 skip_rows: int | None = None, 445 n_rows: int | None = None, 446 schema_sample_rows: int | None = 1_000, 447 dtype_coercion: Literal["coerce", "strict"] = "coerce", 448 use_columns: list[str] 449 | list[int] 450 | str 451 | Callable[[ColumnInfoNoDtype], bool] 452 | None = None, 453 dtypes: DType | DTypeMap | None = None, 454 ) -> ExcelSheet: 455 """Loads a sheet by name. 456 457 Refer to `load_sheet` for parameter documentation 458 """ 459 return self.load_sheet( 460 name, 461 header_row=header_row, 462 column_names=column_names, 463 skip_rows=skip_rows, 464 n_rows=n_rows, 465 schema_sample_rows=schema_sample_rows, 466 dtype_coercion=dtype_coercion, 467 use_columns=use_columns, 468 dtypes=dtypes, 469 ) 470 471 def load_sheet_by_idx( 472 self, 473 idx: int, 474 *, 475 header_row: int | None = 0, 476 column_names: list[str] | None = None, 477 skip_rows: int | None = None, 478 n_rows: int | None = None, 479 schema_sample_rows: int | None = 1_000, 480 dtype_coercion: Literal["coerce", "strict"] = "coerce", 481 use_columns: list[str] 482 | list[int] 483 | str 484 | Callable[[ColumnInfoNoDtype], bool] 485 | None = None, 486 dtypes: DType | DTypeMap | None = None, 487 ) -> ExcelSheet: 488 """Loads a sheet by index. 489 490 Refer to `load_sheet` for parameter documentation 491 """ 492 return self.load_sheet( 493 idx, 494 header_row=header_row, 495 column_names=column_names, 496 skip_rows=skip_rows, 497 n_rows=n_rows, 498 schema_sample_rows=schema_sample_rows, 499 dtype_coercion=dtype_coercion, 500 use_columns=use_columns, 501 dtypes=dtypes, 502 ) 503 504 def __repr__(self) -> str: 505 return self._reader.__repr__()
A class representing an open Excel file and allowing to read its sheets
213 @property 214 def sheet_names(self) -> list[str]: 215 """The list of sheet names""" 216 return self._reader.sheet_names
The list of sheet names
218 def load_sheet( 219 self, 220 idx_or_name: int | str, 221 *, 222 header_row: int | None = 0, 223 column_names: list[str] | None = None, 224 skip_rows: int | None = None, 225 n_rows: int | None = None, 226 schema_sample_rows: int | None = 1_000, 227 dtype_coercion: Literal["coerce", "strict"] = "coerce", 228 use_columns: list[str] 229 | list[int] 230 | str 231 | Callable[[ColumnInfoNoDtype], bool] 232 | None = None, 233 dtypes: DType | DTypeMap | None = None, 234 ) -> ExcelSheet: 235 """Loads a sheet lazily by index or name. 236 237 :param idx_or_name: The index (starting at 0) or the name of the sheet to load. 238 :param header_row: The index of the row containing the column labels, default index is 0. 239 If `None`, the sheet does not have any column labels. 240 Any rows before the `header_row` will be automatically skipped. 241 :param column_names: Overrides headers found in the document. 242 If `column_names` is used, `header_row` will be ignored. 243 :param n_rows: Specifies how many rows should be loaded. 244 If `None`, all rows are loaded 245 :param skip_rows: Specifies how many rows should be skipped after the `header_row`. 246 Any rows before the `header_row` are automatically skipped. 247 If `header_row` is `None`: 248 - if `skip_rows` is `None` (default): it skips all empty rows 249 at the beginning of the sheet. 250 - if `skip_rows` is a number, it skips the specified number 251 of rows from the start of the sheet. 252 :param schema_sample_rows: Specifies how many rows should be used to determine 253 the dtype of a column. Cannot be 0. A specific dtype can be 254 enforced for some or all columns through the `dtypes` parameter. 255 If `None`, all rows will be used. 256 :param dtype_coercion: Specifies how type coercion should behave. `coerce` (the default) 257 will try to coerce different dtypes in a column to the same one, 258 whereas `strict` will raise an error in case a column contains 259 several dtypes. Note that this only applies to columns whose dtype 260 is guessed, i.e. not specified via `dtypes`. 261 :param use_columns: Specifies the columns to use. Can either be: 262 - `None` to select all columns 263 - A list of strings and ints, the column names and/or indices 264 (starting at 0) 265 - A string, a comma separated list of Excel column letters and column 266 ranges (e.g. `“A:E”` or `“A,C,E:F”`, which would result in 267 `A,B,C,D,E` and `A,C,E,F`) 268 - A callable, a function that takes a column and returns a boolean 269 indicating whether the column should be used 270 :param dtypes: An optional dtype (for all columns) 271 or dict of dtypes with keys as column indices or names. 272 """ 273 return ExcelSheet( 274 self._reader.load_sheet( 275 idx_or_name=idx_or_name, 276 header_row=header_row, 277 column_names=column_names, 278 skip_rows=skip_rows, 279 n_rows=n_rows, 280 schema_sample_rows=schema_sample_rows, 281 dtype_coercion=dtype_coercion, 282 use_columns=use_columns, 283 dtypes=dtypes, 284 eager=False, 285 ) 286 )
Loads a sheet lazily by index or name.
Parameters
- idx_or_name: The index (starting at 0) or the name of the sheet to load.
- header_row: The index of the row containing the column labels, default index is 0.
If
None
, the sheet does not have any column labels. Any rows before theheader_row
will be automatically skipped. - column_names: Overrides headers found in the document.
If
column_names
is used,header_row
will be ignored. - n_rows: Specifies how many rows should be loaded.
If
None
, all rows are loaded - skip_rows: Specifies how many rows should be skipped after the
header_row
. Any rows before theheader_row
are automatically skipped. Ifheader_row
isNone
:- if
skip_rows
isNone
(default): it skips all empty rows at the beginning of the sheet. - if
skip_rows
is a number, it skips the specified number of rows from the start of the sheet.
- if
- schema_sample_rows: Specifies how many rows should be used to determine
the dtype of a column. Cannot be 0. A specific dtype can be
enforced for some or all columns through the
dtypes
parameter. IfNone
, all rows will be used. - dtype_coercion: Specifies how type coercion should behave.
coerce
(the default) will try to coerce different dtypes in a column to the same one, whereasstrict
will raise an error in case a column contains several dtypes. Note that this only applies to columns whose dtype is guessed, i.e. not specified viadtypes
. - use_columns: Specifies the columns to use. Can either be:
None
to select all columns- A list of strings and ints, the column names and/or indices (starting at 0)
- A string, a comma separated list of Excel column letters and column
ranges (e.g.
“A:E”
or“A,C,E:F”
, which would result inA,B,C,D,E
andA,C,E,F
) - A callable, a function that takes a column and returns a boolean indicating whether the column should be used
- dtypes: An optional dtype (for all columns) or dict of dtypes with keys as column indices or names.
288 def table_names(self, sheet_name: str | None = None) -> list[str]: 289 """The list of table names. 290 291 Will return an empty list if no tables are found. 292 293 :param sheet_name: If given, will limit the list to the given sheet, will be faster 294 too. 295 """ 296 return self._reader.table_names(sheet_name)
The list of table names.
Will return an empty list if no tables are found.
Parameters
- sheet_name: If given, will limit the list to the given sheet, will be faster too.
336 def load_table( 337 self, 338 name: str, 339 *, 340 header_row: int | None = None, 341 column_names: list[str] | None = None, 342 skip_rows: int = 0, 343 n_rows: int | None = None, 344 schema_sample_rows: int | None = 1_000, 345 dtype_coercion: Literal["coerce", "strict"] = "coerce", 346 use_columns: list[str] 347 | list[int] 348 | str 349 | Callable[[ColumnInfoNoDtype], bool] 350 | None = None, 351 dtypes: DType | DTypeMap | None = None, 352 eager: bool = False, 353 ) -> ExcelTable | pa.RecordBatch: 354 """Loads a table by name. 355 356 :param name: The name of the table to load. 357 :param header_row: The index of the row containing the column labels. 358 If `None`, the table's column names will be used. 359 Any rows before the `header_row` will be automatically skipped. 360 :param column_names: Overrides headers found in the document. 361 If `column_names` is used, `header_row` will be ignored. 362 :param n_rows: Specifies how many rows should be loaded. 363 If `None`, all rows are loaded 364 :param skip_rows: Specifies how many rows should be skipped after the `header_row`. 365 Any rows before the `header_row` are automatically skipped. 366 If `header_row` is `None`, it skips the number of rows from the 367 start of the sheet. 368 :param schema_sample_rows: Specifies how many rows should be used to determine 369 the dtype of a column. Cannot be 0. A specific dtype can be 370 enforced for some or all columns through the `dtypes` parameter. 371 If `None`, all rows will be used. 372 :param dtype_coercion: Specifies how type coercion should behave. `coerce` (the default) 373 will try to coerce different dtypes in a column to the same one, 374 whereas `strict` will raise an error in case a column contains 375 several dtypes. Note that this only applies to columns whose dtype 376 is guessed, i.e. not specified via `dtypes`. 377 :param use_columns: Specifies the columns to use. Can either be: 378 - `None` to select all columns 379 - A list of strings and ints, the column names and/or indices 380 (starting at 0) 381 - A string, a comma separated list of Excel column letters and column 382 ranges (e.g. `“A:E”` or `“A,C,E:F”`, which would result in 383 `A,B,C,D,E` and `A,C,E,F`) 384 - A callable, a function that takes a column and returns a boolean 385 indicating whether the column should be used 386 :param dtypes: An optional dtype (for all columns) 387 or dict of dtypes with keys as column indices or names. 388 """ 389 output = self._reader.load_table( # type:ignore[call-overload,misc] 390 name=name, 391 header_row=header_row, 392 column_names=column_names, 393 skip_rows=skip_rows, 394 n_rows=n_rows, 395 schema_sample_rows=schema_sample_rows, 396 dtype_coercion=dtype_coercion, 397 use_columns=use_columns, 398 dtypes=dtypes, 399 eager=eager, 400 ) 401 if eager: 402 return output 403 return ExcelTable(output)
Loads a table by name.
Parameters
- name: The name of the table to load.
- header_row: The index of the row containing the column labels.
If
None
, the table's column names will be used. Any rows before theheader_row
will be automatically skipped. - column_names: Overrides headers found in the document.
If
column_names
is used,header_row
will be ignored. - n_rows: Specifies how many rows should be loaded.
If
None
, all rows are loaded - skip_rows: Specifies how many rows should be skipped after the
header_row
. Any rows before theheader_row
are automatically skipped. Ifheader_row
isNone
, it skips the number of rows from the start of the sheet. - schema_sample_rows: Specifies how many rows should be used to determine
the dtype of a column. Cannot be 0. A specific dtype can be
enforced for some or all columns through the
dtypes
parameter. IfNone
, all rows will be used. - dtype_coercion: Specifies how type coercion should behave.
coerce
(the default) will try to coerce different dtypes in a column to the same one, whereasstrict
will raise an error in case a column contains several dtypes. Note that this only applies to columns whose dtype is guessed, i.e. not specified viadtypes
. - use_columns: Specifies the columns to use. Can either be:
None
to select all columns- A list of strings and ints, the column names and/or indices (starting at 0)
- A string, a comma separated list of Excel column letters and column
ranges (e.g.
“A:E”
or“A,C,E:F”
, which would result inA,B,C,D,E
andA,C,E,F
) - A callable, a function that takes a column and returns a boolean indicating whether the column should be used
- dtypes: An optional dtype (for all columns) or dict of dtypes with keys as column indices or names.
405 def load_sheet_eager( 406 self, 407 idx_or_name: int | str, 408 *, 409 header_row: int | None = 0, 410 column_names: list[str] | None = None, 411 skip_rows: int | None = None, 412 n_rows: int | None = None, 413 schema_sample_rows: int | None = 1_000, 414 dtype_coercion: Literal["coerce", "strict"] = "coerce", 415 use_columns: list[str] | list[int] | str | None = None, 416 dtypes: DType | DTypeMap | None = None, 417 ) -> pa.RecordBatch: 418 """Loads a sheet eagerly by index or name. 419 420 For xlsx files, this will be faster and more memory-efficient, as it will use 421 `worksheet_range_ref` under the hood, which returns borrowed types. 422 423 Refer to `load_sheet` for parameter documentation 424 """ 425 return self._reader.load_sheet( 426 idx_or_name=idx_or_name, 427 header_row=header_row, 428 column_names=column_names, 429 skip_rows=skip_rows, 430 n_rows=n_rows, 431 schema_sample_rows=schema_sample_rows, 432 dtype_coercion=dtype_coercion, 433 use_columns=use_columns, 434 dtypes=dtypes, 435 eager=True, 436 )
Loads a sheet eagerly by index or name.
For xlsx files, this will be faster and more memory-efficient, as it will use
worksheet_range_ref
under the hood, which returns borrowed types.
Refer to load_sheet
for parameter documentation
438 def load_sheet_by_name( 439 self, 440 name: str, 441 *, 442 header_row: int | None = 0, 443 column_names: list[str] | None = None, 444 skip_rows: int | None = None, 445 n_rows: int | None = None, 446 schema_sample_rows: int | None = 1_000, 447 dtype_coercion: Literal["coerce", "strict"] = "coerce", 448 use_columns: list[str] 449 | list[int] 450 | str 451 | Callable[[ColumnInfoNoDtype], bool] 452 | None = None, 453 dtypes: DType | DTypeMap | None = None, 454 ) -> ExcelSheet: 455 """Loads a sheet by name. 456 457 Refer to `load_sheet` for parameter documentation 458 """ 459 return self.load_sheet( 460 name, 461 header_row=header_row, 462 column_names=column_names, 463 skip_rows=skip_rows, 464 n_rows=n_rows, 465 schema_sample_rows=schema_sample_rows, 466 dtype_coercion=dtype_coercion, 467 use_columns=use_columns, 468 dtypes=dtypes, 469 )
Loads a sheet by name.
Refer to load_sheet
for parameter documentation
471 def load_sheet_by_idx( 472 self, 473 idx: int, 474 *, 475 header_row: int | None = 0, 476 column_names: list[str] | None = None, 477 skip_rows: int | None = None, 478 n_rows: int | None = None, 479 schema_sample_rows: int | None = 1_000, 480 dtype_coercion: Literal["coerce", "strict"] = "coerce", 481 use_columns: list[str] 482 | list[int] 483 | str 484 | Callable[[ColumnInfoNoDtype], bool] 485 | None = None, 486 dtypes: DType | DTypeMap | None = None, 487 ) -> ExcelSheet: 488 """Loads a sheet by index. 489 490 Refer to `load_sheet` for parameter documentation 491 """ 492 return self.load_sheet( 493 idx, 494 header_row=header_row, 495 column_names=column_names, 496 skip_rows=skip_rows, 497 n_rows=n_rows, 498 schema_sample_rows=schema_sample_rows, 499 dtype_coercion=dtype_coercion, 500 use_columns=use_columns, 501 dtypes=dtypes, 502 )
Loads a sheet by index.
Refer to load_sheet
for parameter documentation
58class ExcelSheet: 59 """A class representing a single sheet in an Excel File""" 60 61 def __init__(self, sheet: _ExcelSheet) -> None: 62 self._sheet = sheet 63 64 @property 65 def name(self) -> str: 66 """The name of the sheet""" 67 return self._sheet.name 68 69 @property 70 def width(self) -> int: 71 """The sheet's width""" 72 return self._sheet.width 73 74 @property 75 def height(self) -> int: 76 """The sheet's height, with `skip_rows` and `nrows` applied""" 77 return self._sheet.height 78 79 @property 80 def total_height(self) -> int: 81 """The sheet's total height""" 82 return self._sheet.total_height 83 84 @property 85 def selected_columns(self) -> list[ColumnInfo]: 86 """The sheet's selected columns""" 87 return self._sheet.selected_columns 88 89 def available_columns(self) -> list[ColumnInfo]: 90 """The columns available for the given sheet""" 91 return self._sheet.available_columns() 92 93 @property 94 def specified_dtypes(self) -> DTypeMap | None: 95 """The dtypes specified for the sheet""" 96 return self._sheet.specified_dtypes 97 98 @property 99 def visible(self) -> SheetVisible: 100 """The visibility of the sheet""" 101 return self._sheet.visible 102 103 def to_arrow(self) -> pa.RecordBatch: 104 """Converts the sheet to a pyarrow `RecordBatch`""" 105 return self._sheet.to_arrow() 106 107 def to_arrow_with_errors(self) -> tuple[pa.RecordBatch, CellErrors | None]: 108 """Converts the sheet to a pyarrow `RecordBatch` with error information. 109 110 Stores the positions of any values that cannot be parsed as the specified type and were 111 therefore converted to None. 112 """ 113 rb, cell_errors = self._sheet.to_arrow_with_errors() 114 if not cell_errors.errors: 115 return (rb, None) 116 return (rb, cell_errors) 117 118 def to_pandas(self) -> "pd.DataFrame": 119 """Converts the sheet to a Pandas `DataFrame`. 120 121 Requires the `pandas` extra to be installed. 122 """ 123 # We know for sure that the sheet will yield exactly one RecordBatch 124 return self.to_arrow().to_pandas() 125 126 def to_polars(self) -> "pl.DataFrame": 127 """Converts the sheet to a Polars `DataFrame`. 128 129 Requires the `polars` extra to be installed. 130 """ 131 return _recordbatch_to_polars(self.to_arrow()) 132 133 def __repr__(self) -> str: 134 return self._sheet.__repr__()
A class representing a single sheet in an Excel File
74 @property 75 def height(self) -> int: 76 """The sheet's height, with `skip_rows` and `nrows` applied""" 77 return self._sheet.height
The sheet's height, with skip_rows
and nrows
applied
79 @property 80 def total_height(self) -> int: 81 """The sheet's total height""" 82 return self._sheet.total_height
The sheet's total height
84 @property 85 def selected_columns(self) -> list[ColumnInfo]: 86 """The sheet's selected columns""" 87 return self._sheet.selected_columns
The sheet's selected columns
89 def available_columns(self) -> list[ColumnInfo]: 90 """The columns available for the given sheet""" 91 return self._sheet.available_columns()
The columns available for the given sheet
93 @property 94 def specified_dtypes(self) -> DTypeMap | None: 95 """The dtypes specified for the sheet""" 96 return self._sheet.specified_dtypes
The dtypes specified for the sheet
98 @property 99 def visible(self) -> SheetVisible: 100 """The visibility of the sheet""" 101 return self._sheet.visible
The visibility of the sheet
103 def to_arrow(self) -> pa.RecordBatch: 104 """Converts the sheet to a pyarrow `RecordBatch`""" 105 return self._sheet.to_arrow()
Converts the sheet to a pyarrow RecordBatch
107 def to_arrow_with_errors(self) -> tuple[pa.RecordBatch, CellErrors | None]: 108 """Converts the sheet to a pyarrow `RecordBatch` with error information. 109 110 Stores the positions of any values that cannot be parsed as the specified type and were 111 therefore converted to None. 112 """ 113 rb, cell_errors = self._sheet.to_arrow_with_errors() 114 if not cell_errors.errors: 115 return (rb, None) 116 return (rb, cell_errors)
Converts the sheet to a pyarrow RecordBatch
with error information.
Stores the positions of any values that cannot be parsed as the specified type and were therefore converted to None.
118 def to_pandas(self) -> "pd.DataFrame": 119 """Converts the sheet to a Pandas `DataFrame`. 120 121 Requires the `pandas` extra to be installed. 122 """ 123 # We know for sure that the sheet will yield exactly one RecordBatch 124 return self.to_arrow().to_pandas()
Converts the sheet to a Pandas DataFrame
.
Requires the pandas
extra to be installed.
126 def to_polars(self) -> "pl.DataFrame": 127 """Converts the sheet to a Polars `DataFrame`. 128 129 Requires the `polars` extra to be installed. 130 """ 131 return _recordbatch_to_polars(self.to_arrow())
Converts the sheet to a Polars DataFrame
.
Requires the polars
extra to be installed.
This class provides information about a single column in a sheet
fastexcel.ColumnNameFrom
. How the name of the column was determined.
One of three possible values:
"provided"
: The column name was provided via theuse_columns
parameter"looked_up"
: The column name was looked up from the data found in the sheet"generated"
: The column name was generated from the column index, either becauseheader_row
wasNone
, or because it could not be looked up
fastexcel.DTypeFrom
. How the dtype of the column was determined.
One of three possible values:
"provided_by_index"
: The dtype was specified via the column index"provided_by_name"
: The dtype was specified via the column name"guessed"
: The dtype was determined from the content of the column
The base class for all fastexcel errors
Data for a given cell cannot be retrieved
calamine returned an error regarding the content of the cell
Generic calamine error
Sheet was not found
Column was not found
Generic arrow error
Provided parameters are invalid
Column contains an unsupported type combination