fastexcel
1from __future__ import annotations 2 3import sys 4import typing 5from typing import TYPE_CHECKING, Callable, Literal 6 7if sys.version_info < (3, 10): 8 from typing_extensions import TypeAlias 9else: 10 from typing import TypeAlias 11 12if TYPE_CHECKING: 13 import pandas as pd 14 import polars as pl 15 16from os.path import expanduser 17from pathlib import Path 18 19import pyarrow as pa 20 21from ._fastexcel import ( 22 ArrowError, 23 CalamineCellError, 24 CalamineError, 25 CannotRetrieveCellDataError, 26 ColumnInfo, 27 ColumnInfoNoDtype, 28 ColumnNotFoundError, 29 FastExcelError, 30 InvalidParametersError, 31 SheetNotFoundError, 32 UnsupportedColumnTypeCombinationError, 33 __version__, 34 _ExcelReader, 35 _ExcelSheet, 36 _ExcelTable, 37) 38from ._fastexcel import read_excel as _read_excel 39 40DType = Literal["null", "int", "float", "string", "boolean", "datetime", "date", "duration"] 41DTypeMap: TypeAlias = "dict[str | int, DType]" 42ColumnNameFrom: TypeAlias = Literal["provided", "looked_up", "generated"] 43DTypeFrom: TypeAlias = Literal["provided_by_index", "provided_by_name", "guessed"] 44SheetVisible: TypeAlias = Literal["visible", "hidden", "veryhidden"] 45 46 47def _recordbatch_to_polars(rb: pa.RecordBatch) -> pl.DataFrame: 48 import polars as pl 49 50 df = pl.from_arrow(data=rb) 51 assert isinstance(df, pl.DataFrame) 52 return df 53 54 55class ExcelSheet: 56 """A class representing a single sheet in an Excel File""" 57 58 def __init__(self, sheet: _ExcelSheet) -> None: 59 self._sheet = sheet 60 61 @property 62 def name(self) -> str: 63 """The name of the sheet""" 64 return self._sheet.name 65 66 @property 67 def width(self) -> int: 68 """The sheet's width""" 69 return self._sheet.width 70 71 @property 72 def height(self) -> int: 73 """The sheet's height, with `skip_rows` and `nrows` applied""" 74 return self._sheet.height 75 76 @property 77 def total_height(self) -> int: 78 """The sheet's total height""" 79 return self._sheet.total_height 80 81 @property 82 def selected_columns(self) -> list[ColumnInfo]: 83 """The sheet's selected columns""" 84 return self._sheet.selected_columns 85 86 def available_columns(self) -> list[ColumnInfo]: 87 """The columns available for the given sheet""" 88 return self._sheet.available_columns() 89 90 @property 91 def specified_dtypes(self) -> DTypeMap | None: 92 """The dtypes specified for the sheet""" 93 return self._sheet.specified_dtypes 94 95 @property 96 def visible(self) -> SheetVisible: 97 """The visibility of the sheet""" 98 return self._sheet.visible 99 100 def to_arrow(self) -> pa.RecordBatch: 101 """Converts the sheet to a pyarrow `RecordBatch`""" 102 return self._sheet.to_arrow() 103 104 def to_pandas(self) -> "pd.DataFrame": 105 """Converts the sheet to a Pandas `DataFrame`. 106 107 Requires the `pandas` extra to be installed. 108 """ 109 # We know for sure that the sheet will yield exactly one RecordBatch 110 return self.to_arrow().to_pandas() 111 112 def to_polars(self) -> "pl.DataFrame": 113 """Converts the sheet to a Polars `DataFrame`. 114 115 Requires the `polars` extra to be installed. 116 """ 117 return _recordbatch_to_polars(self.to_arrow()) 118 119 def __repr__(self) -> str: 120 return self._sheet.__repr__() 121 122 123class ExcelTable: 124 """A class representing a single table in an Excel file""" 125 126 def __init__(self, table: _ExcelTable) -> None: 127 self._table = table 128 129 @property 130 def name(self) -> str: 131 """The name of the table""" 132 return self._table.name 133 134 @property 135 def sheet_name(self) -> str: 136 """The name of the sheet this table belongs to""" 137 return self._table.sheet_name 138 139 @property 140 def width(self) -> int: 141 """The table's width""" 142 return self._table.width 143 144 @property 145 def height(self) -> int: 146 """The table's height""" 147 return self._table.height 148 149 @property 150 def total_height(self) -> int: 151 """The table's total height""" 152 return self._table.total_height 153 154 @property 155 def offset(self) -> int: 156 """The table's offset before data starts""" 157 return self._table.offset 158 159 @property 160 def selected_columns(self) -> list[ColumnInfo]: 161 """The table's selected columns""" 162 return self._table.selected_columns 163 164 def available_columns(self) -> list[ColumnInfo]: 165 """The columns available for the given table""" 166 return self._table.available_columns() 167 168 @property 169 def specified_dtypes(self) -> DTypeMap | None: 170 """The dtypes specified for the table""" 171 return self._table.specified_dtypes 172 173 def to_arrow(self) -> pa.RecordBatch: 174 """Converts the table to a pyarrow `RecordBatch`""" 175 return self._table.to_arrow() 176 177 def to_pandas(self) -> "pd.DataFrame": 178 """Converts the table to a Pandas `DataFrame`. 179 180 Requires the `pandas` extra to be installed. 181 """ 182 # We know for sure that the table will yield exactly one RecordBatch 183 return self.to_arrow().to_pandas() 184 185 def to_polars(self) -> "pl.DataFrame": 186 """Converts the table to a Polars `DataFrame`. 187 188 Requires the `polars` extra to be installed. 189 """ 190 return _recordbatch_to_polars(self.to_arrow()) 191 192 193class ExcelReader: 194 """A class representing an open Excel file and allowing to read its sheets""" 195 196 def __init__(self, reader: _ExcelReader) -> None: 197 self._reader = reader 198 199 @property 200 def sheet_names(self) -> list[str]: 201 """The list of sheet names""" 202 return self._reader.sheet_names 203 204 def load_sheet( 205 self, 206 idx_or_name: int | str, 207 *, 208 header_row: int | None = 0, 209 column_names: list[str] | None = None, 210 skip_rows: int | None = None, 211 n_rows: int | None = None, 212 schema_sample_rows: int | None = 1_000, 213 dtype_coercion: Literal["coerce", "strict"] = "coerce", 214 use_columns: list[str] 215 | list[int] 216 | str 217 | Callable[[ColumnInfoNoDtype], bool] 218 | None = None, 219 dtypes: DType | DTypeMap | None = None, 220 ) -> ExcelSheet: 221 """Loads a sheet lazily by index or name. 222 223 :param idx_or_name: The index (starting at 0) or the name of the sheet to load. 224 :param header_row: The index of the row containing the column labels, default index is 0. 225 If `None`, the sheet does not have any column labels. 226 Any rows before the `header_row` will be automatically skipped. 227 :param column_names: Overrides headers found in the document. 228 If `column_names` is used, `header_row` will be ignored. 229 :param n_rows: Specifies how many rows should be loaded. 230 If `None`, all rows are loaded 231 :param skip_rows: Specifies how many rows should be skipped after the `header_row`. 232 Any rows before the `header_row` are automatically skipped. 233 If `header_row` is `None`: 234 - if `skip_rows` is `None` (default): it skips all empty rows 235 at the beginning of the sheet. 236 - if `skip_rows` is a number, it skips the specified number 237 of rows from the start of the sheet. 238 :param schema_sample_rows: Specifies how many rows should be used to determine 239 the dtype of a column. Cannot be 0. A specific dtype can be 240 enforced for some or all columns through the `dtypes` parameter. 241 If `None`, all rows will be used. 242 :param dtype_coercion: Specifies how type coercion should behave. `coerce` (the default) 243 will try to coerce different dtypes in a column to the same one, 244 whereas `strict` will raise an error in case a column contains 245 several dtypes. Note that this only applies to columns whose dtype 246 is guessed, i.e. not specified via `dtypes`. 247 :param use_columns: Specifies the columns to use. Can either be: 248 - `None` to select all columns 249 - A list of strings and ints, the column names and/or indices 250 (starting at 0) 251 - A string, a comma separated list of Excel column letters and column 252 ranges (e.g. `“A:E”` or `“A,C,E:F”`, which would result in 253 `A,B,C,D,E` and `A,C,E,F`) 254 - A callable, a function that takes a column and returns a boolean 255 indicating whether the column should be used 256 :param dtypes: An optional dtype (for all columns) 257 or dict of dtypes with keys as column indices or names. 258 """ 259 return ExcelSheet( 260 self._reader.load_sheet( 261 idx_or_name=idx_or_name, 262 header_row=header_row, 263 column_names=column_names, 264 skip_rows=skip_rows, 265 n_rows=n_rows, 266 schema_sample_rows=schema_sample_rows, 267 dtype_coercion=dtype_coercion, 268 use_columns=use_columns, 269 dtypes=dtypes, 270 eager=False, 271 ) 272 ) 273 274 def table_names(self, sheet_name: str | None = None) -> list[str]: 275 """The list of table names. 276 277 Will return an empty list if no tables are found. 278 279 :param sheet_name: If given, will limit the list to the given sheet, will be faster 280 too. 281 """ 282 return self._reader.table_names(sheet_name) 283 284 @typing.overload 285 def load_table( 286 self, 287 name: str, 288 *, 289 header_row: int | None = None, 290 column_names: list[str] | None = None, 291 skip_rows: int = 0, 292 n_rows: int | None = None, 293 schema_sample_rows: int | None = 1_000, 294 dtype_coercion: Literal["coerce", "strict"] = "coerce", 295 use_columns: list[str] 296 | list[int] 297 | str 298 | Callable[[ColumnInfoNoDtype], bool] 299 | None = None, 300 dtypes: DType | DTypeMap | None = None, 301 eager: Literal[False] = ..., 302 ) -> ExcelTable: ... 303 @typing.overload 304 def load_table( 305 self, 306 name: str, 307 *, 308 header_row: int | None = None, 309 column_names: list[str] | None = None, 310 skip_rows: int = 0, 311 n_rows: int | None = None, 312 schema_sample_rows: int | None = 1_000, 313 dtype_coercion: Literal["coerce", "strict"] = "coerce", 314 use_columns: list[str] 315 | list[int] 316 | str 317 | Callable[[ColumnInfoNoDtype], bool] 318 | None = None, 319 dtypes: DType | DTypeMap | None = None, 320 eager: Literal[True] = ..., 321 ) -> pa.RecordBatch: ... 322 def load_table( 323 self, 324 name: str, 325 *, 326 header_row: int | None = None, 327 column_names: list[str] | None = None, 328 skip_rows: int = 0, 329 n_rows: int | None = None, 330 schema_sample_rows: int | None = 1_000, 331 dtype_coercion: Literal["coerce", "strict"] = "coerce", 332 use_columns: list[str] 333 | list[int] 334 | str 335 | Callable[[ColumnInfoNoDtype], bool] 336 | None = None, 337 dtypes: DType | DTypeMap | None = None, 338 eager: bool = False, 339 ) -> ExcelTable | pa.RecordBatch: 340 """Loads a table by name. 341 342 :param name: The name of the table to load. 343 :param header_row: The index of the row containing the column labels. 344 If `None`, the table's column names will be used. 345 Any rows before the `header_row` will be automatically skipped. 346 :param column_names: Overrides headers found in the document. 347 If `column_names` is used, `header_row` will be ignored. 348 :param n_rows: Specifies how many rows should be loaded. 349 If `None`, all rows are loaded 350 :param skip_rows: Specifies how many rows should be skipped after the `header_row`. 351 Any rows before the `header_row` are automatically skipped. 352 If `header_row` is `None`, it skips the number of rows from the 353 start of the sheet. 354 :param schema_sample_rows: Specifies how many rows should be used to determine 355 the dtype of a column. Cannot be 0. A specific dtype can be 356 enforced for some or all columns through the `dtypes` parameter. 357 If `None`, all rows will be used. 358 :param dtype_coercion: Specifies how type coercion should behave. `coerce` (the default) 359 will try to coerce different dtypes in a column to the same one, 360 whereas `strict` will raise an error in case a column contains 361 several dtypes. Note that this only applies to columns whose dtype 362 is guessed, i.e. not specified via `dtypes`. 363 :param use_columns: Specifies the columns to use. Can either be: 364 - `None` to select all columns 365 - A list of strings and ints, the column names and/or indices 366 (starting at 0) 367 - A string, a comma separated list of Excel column letters and column 368 ranges (e.g. `“A:E”` or `“A,C,E:F”`, which would result in 369 `A,B,C,D,E` and `A,C,E,F`) 370 - A callable, a function that takes a column and returns a boolean 371 indicating whether the column should be used 372 :param dtypes: An optional dtype (for all columns) 373 or dict of dtypes with keys as column indices or names. 374 """ 375 output = self._reader.load_table( # type:ignore[call-overload,misc] 376 name=name, 377 header_row=header_row, 378 column_names=column_names, 379 skip_rows=skip_rows, 380 n_rows=n_rows, 381 schema_sample_rows=schema_sample_rows, 382 dtype_coercion=dtype_coercion, 383 use_columns=use_columns, 384 dtypes=dtypes, 385 eager=eager, 386 ) 387 if eager: 388 return output 389 return ExcelTable(output) 390 391 def load_sheet_eager( 392 self, 393 idx_or_name: int | str, 394 *, 395 header_row: int | None = 0, 396 column_names: list[str] | None = None, 397 skip_rows: int | None = None, 398 n_rows: int | None = None, 399 schema_sample_rows: int | None = 1_000, 400 dtype_coercion: Literal["coerce", "strict"] = "coerce", 401 use_columns: list[str] | list[int] | str | None = None, 402 dtypes: DType | DTypeMap | None = None, 403 ) -> pa.RecordBatch: 404 """Loads a sheet eagerly by index or name. 405 406 For xlsx files, this will be faster and more memory-efficient, as it will use 407 `worksheet_range_ref` under the hood, which returns borrowed types. 408 409 Refer to `load_sheet` for parameter documentation 410 """ 411 return self._reader.load_sheet( 412 idx_or_name=idx_or_name, 413 header_row=header_row, 414 column_names=column_names, 415 skip_rows=skip_rows, 416 n_rows=n_rows, 417 schema_sample_rows=schema_sample_rows, 418 dtype_coercion=dtype_coercion, 419 use_columns=use_columns, 420 dtypes=dtypes, 421 eager=True, 422 ) 423 424 def load_sheet_by_name( 425 self, 426 name: str, 427 *, 428 header_row: int | None = 0, 429 column_names: list[str] | None = None, 430 skip_rows: int | None = None, 431 n_rows: int | None = None, 432 schema_sample_rows: int | None = 1_000, 433 dtype_coercion: Literal["coerce", "strict"] = "coerce", 434 use_columns: list[str] 435 | list[int] 436 | str 437 | Callable[[ColumnInfoNoDtype], bool] 438 | None = None, 439 dtypes: DType | DTypeMap | None = None, 440 ) -> ExcelSheet: 441 """Loads a sheet by name. 442 443 Refer to `load_sheet` for parameter documentation 444 """ 445 return self.load_sheet( 446 name, 447 header_row=header_row, 448 column_names=column_names, 449 skip_rows=skip_rows, 450 n_rows=n_rows, 451 schema_sample_rows=schema_sample_rows, 452 dtype_coercion=dtype_coercion, 453 use_columns=use_columns, 454 dtypes=dtypes, 455 ) 456 457 def load_sheet_by_idx( 458 self, 459 idx: int, 460 *, 461 header_row: int | None = 0, 462 column_names: list[str] | None = None, 463 skip_rows: int | None = None, 464 n_rows: int | None = None, 465 schema_sample_rows: int | None = 1_000, 466 dtype_coercion: Literal["coerce", "strict"] = "coerce", 467 use_columns: list[str] 468 | list[int] 469 | str 470 | Callable[[ColumnInfoNoDtype], bool] 471 | None = None, 472 dtypes: DType | DTypeMap | None = None, 473 ) -> ExcelSheet: 474 """Loads a sheet by index. 475 476 Refer to `load_sheet` for parameter documentation 477 """ 478 return self.load_sheet( 479 idx, 480 header_row=header_row, 481 column_names=column_names, 482 skip_rows=skip_rows, 483 n_rows=n_rows, 484 schema_sample_rows=schema_sample_rows, 485 dtype_coercion=dtype_coercion, 486 use_columns=use_columns, 487 dtypes=dtypes, 488 ) 489 490 def __repr__(self) -> str: 491 return self._reader.__repr__() 492 493 494def read_excel(source: Path | str | bytes) -> ExcelReader: 495 """Opens and loads an excel file. 496 497 :param source: The path to a file or its content as bytes 498 """ 499 if isinstance(source, (str, Path)): 500 source = expanduser(source) 501 return ExcelReader(_read_excel(source)) 502 503 504__all__ = ( 505 ## version 506 "__version__", 507 ## main entrypoint 508 "read_excel", 509 ## Python types 510 "DType", 511 "DTypeMap", 512 # Excel reader 513 "ExcelReader", 514 # Excel sheet 515 "ExcelSheet", 516 # Column metadata 517 "DTypeFrom", 518 "ColumnNameFrom", 519 "ColumnInfo", 520 # Exceptions 521 "FastExcelError", 522 "CannotRetrieveCellDataError", 523 "CalamineCellError", 524 "CalamineError", 525 "SheetNotFoundError", 526 "ColumnNotFoundError", 527 "ArrowError", 528 "InvalidParametersError", 529 "UnsupportedColumnTypeCombinationError", 530)
495def read_excel(source: Path | str | bytes) -> ExcelReader: 496 """Opens and loads an excel file. 497 498 :param source: The path to a file or its content as bytes 499 """ 500 if isinstance(source, (str, Path)): 501 source = expanduser(source) 502 return ExcelReader(_read_excel(source))
Opens and loads an excel file.
Parameters
- source: The path to a file or its content as bytes
194class ExcelReader: 195 """A class representing an open Excel file and allowing to read its sheets""" 196 197 def __init__(self, reader: _ExcelReader) -> None: 198 self._reader = reader 199 200 @property 201 def sheet_names(self) -> list[str]: 202 """The list of sheet names""" 203 return self._reader.sheet_names 204 205 def load_sheet( 206 self, 207 idx_or_name: int | str, 208 *, 209 header_row: int | None = 0, 210 column_names: list[str] | None = None, 211 skip_rows: int | None = None, 212 n_rows: int | None = None, 213 schema_sample_rows: int | None = 1_000, 214 dtype_coercion: Literal["coerce", "strict"] = "coerce", 215 use_columns: list[str] 216 | list[int] 217 | str 218 | Callable[[ColumnInfoNoDtype], bool] 219 | None = None, 220 dtypes: DType | DTypeMap | None = None, 221 ) -> ExcelSheet: 222 """Loads a sheet lazily by index or name. 223 224 :param idx_or_name: The index (starting at 0) or the name of the sheet to load. 225 :param header_row: The index of the row containing the column labels, default index is 0. 226 If `None`, the sheet does not have any column labels. 227 Any rows before the `header_row` will be automatically skipped. 228 :param column_names: Overrides headers found in the document. 229 If `column_names` is used, `header_row` will be ignored. 230 :param n_rows: Specifies how many rows should be loaded. 231 If `None`, all rows are loaded 232 :param skip_rows: Specifies how many rows should be skipped after the `header_row`. 233 Any rows before the `header_row` are automatically skipped. 234 If `header_row` is `None`: 235 - if `skip_rows` is `None` (default): it skips all empty rows 236 at the beginning of the sheet. 237 - if `skip_rows` is a number, it skips the specified number 238 of rows from the start of the sheet. 239 :param schema_sample_rows: Specifies how many rows should be used to determine 240 the dtype of a column. Cannot be 0. A specific dtype can be 241 enforced for some or all columns through the `dtypes` parameter. 242 If `None`, all rows will be used. 243 :param dtype_coercion: Specifies how type coercion should behave. `coerce` (the default) 244 will try to coerce different dtypes in a column to the same one, 245 whereas `strict` will raise an error in case a column contains 246 several dtypes. Note that this only applies to columns whose dtype 247 is guessed, i.e. not specified via `dtypes`. 248 :param use_columns: Specifies the columns to use. Can either be: 249 - `None` to select all columns 250 - A list of strings and ints, the column names and/or indices 251 (starting at 0) 252 - A string, a comma separated list of Excel column letters and column 253 ranges (e.g. `“A:E”` or `“A,C,E:F”`, which would result in 254 `A,B,C,D,E` and `A,C,E,F`) 255 - A callable, a function that takes a column and returns a boolean 256 indicating whether the column should be used 257 :param dtypes: An optional dtype (for all columns) 258 or dict of dtypes with keys as column indices or names. 259 """ 260 return ExcelSheet( 261 self._reader.load_sheet( 262 idx_or_name=idx_or_name, 263 header_row=header_row, 264 column_names=column_names, 265 skip_rows=skip_rows, 266 n_rows=n_rows, 267 schema_sample_rows=schema_sample_rows, 268 dtype_coercion=dtype_coercion, 269 use_columns=use_columns, 270 dtypes=dtypes, 271 eager=False, 272 ) 273 ) 274 275 def table_names(self, sheet_name: str | None = None) -> list[str]: 276 """The list of table names. 277 278 Will return an empty list if no tables are found. 279 280 :param sheet_name: If given, will limit the list to the given sheet, will be faster 281 too. 282 """ 283 return self._reader.table_names(sheet_name) 284 285 @typing.overload 286 def load_table( 287 self, 288 name: str, 289 *, 290 header_row: int | None = None, 291 column_names: list[str] | None = None, 292 skip_rows: int = 0, 293 n_rows: int | None = None, 294 schema_sample_rows: int | None = 1_000, 295 dtype_coercion: Literal["coerce", "strict"] = "coerce", 296 use_columns: list[str] 297 | list[int] 298 | str 299 | Callable[[ColumnInfoNoDtype], bool] 300 | None = None, 301 dtypes: DType | DTypeMap | None = None, 302 eager: Literal[False] = ..., 303 ) -> ExcelTable: ... 304 @typing.overload 305 def load_table( 306 self, 307 name: str, 308 *, 309 header_row: int | None = None, 310 column_names: list[str] | None = None, 311 skip_rows: int = 0, 312 n_rows: int | None = None, 313 schema_sample_rows: int | None = 1_000, 314 dtype_coercion: Literal["coerce", "strict"] = "coerce", 315 use_columns: list[str] 316 | list[int] 317 | str 318 | Callable[[ColumnInfoNoDtype], bool] 319 | None = None, 320 dtypes: DType | DTypeMap | None = None, 321 eager: Literal[True] = ..., 322 ) -> pa.RecordBatch: ... 323 def load_table( 324 self, 325 name: str, 326 *, 327 header_row: int | None = None, 328 column_names: list[str] | None = None, 329 skip_rows: int = 0, 330 n_rows: int | None = None, 331 schema_sample_rows: int | None = 1_000, 332 dtype_coercion: Literal["coerce", "strict"] = "coerce", 333 use_columns: list[str] 334 | list[int] 335 | str 336 | Callable[[ColumnInfoNoDtype], bool] 337 | None = None, 338 dtypes: DType | DTypeMap | None = None, 339 eager: bool = False, 340 ) -> ExcelTable | pa.RecordBatch: 341 """Loads a table by name. 342 343 :param name: The name of the table to load. 344 :param header_row: The index of the row containing the column labels. 345 If `None`, the table's column names will be used. 346 Any rows before the `header_row` will be automatically skipped. 347 :param column_names: Overrides headers found in the document. 348 If `column_names` is used, `header_row` will be ignored. 349 :param n_rows: Specifies how many rows should be loaded. 350 If `None`, all rows are loaded 351 :param skip_rows: Specifies how many rows should be skipped after the `header_row`. 352 Any rows before the `header_row` are automatically skipped. 353 If `header_row` is `None`, it skips the number of rows from the 354 start of the sheet. 355 :param schema_sample_rows: Specifies how many rows should be used to determine 356 the dtype of a column. Cannot be 0. A specific dtype can be 357 enforced for some or all columns through the `dtypes` parameter. 358 If `None`, all rows will be used. 359 :param dtype_coercion: Specifies how type coercion should behave. `coerce` (the default) 360 will try to coerce different dtypes in a column to the same one, 361 whereas `strict` will raise an error in case a column contains 362 several dtypes. Note that this only applies to columns whose dtype 363 is guessed, i.e. not specified via `dtypes`. 364 :param use_columns: Specifies the columns to use. Can either be: 365 - `None` to select all columns 366 - A list of strings and ints, the column names and/or indices 367 (starting at 0) 368 - A string, a comma separated list of Excel column letters and column 369 ranges (e.g. `“A:E”` or `“A,C,E:F”`, which would result in 370 `A,B,C,D,E` and `A,C,E,F`) 371 - A callable, a function that takes a column and returns a boolean 372 indicating whether the column should be used 373 :param dtypes: An optional dtype (for all columns) 374 or dict of dtypes with keys as column indices or names. 375 """ 376 output = self._reader.load_table( # type:ignore[call-overload,misc] 377 name=name, 378 header_row=header_row, 379 column_names=column_names, 380 skip_rows=skip_rows, 381 n_rows=n_rows, 382 schema_sample_rows=schema_sample_rows, 383 dtype_coercion=dtype_coercion, 384 use_columns=use_columns, 385 dtypes=dtypes, 386 eager=eager, 387 ) 388 if eager: 389 return output 390 return ExcelTable(output) 391 392 def load_sheet_eager( 393 self, 394 idx_or_name: int | str, 395 *, 396 header_row: int | None = 0, 397 column_names: list[str] | None = None, 398 skip_rows: int | None = None, 399 n_rows: int | None = None, 400 schema_sample_rows: int | None = 1_000, 401 dtype_coercion: Literal["coerce", "strict"] = "coerce", 402 use_columns: list[str] | list[int] | str | None = None, 403 dtypes: DType | DTypeMap | None = None, 404 ) -> pa.RecordBatch: 405 """Loads a sheet eagerly by index or name. 406 407 For xlsx files, this will be faster and more memory-efficient, as it will use 408 `worksheet_range_ref` under the hood, which returns borrowed types. 409 410 Refer to `load_sheet` for parameter documentation 411 """ 412 return self._reader.load_sheet( 413 idx_or_name=idx_or_name, 414 header_row=header_row, 415 column_names=column_names, 416 skip_rows=skip_rows, 417 n_rows=n_rows, 418 schema_sample_rows=schema_sample_rows, 419 dtype_coercion=dtype_coercion, 420 use_columns=use_columns, 421 dtypes=dtypes, 422 eager=True, 423 ) 424 425 def load_sheet_by_name( 426 self, 427 name: str, 428 *, 429 header_row: int | None = 0, 430 column_names: list[str] | None = None, 431 skip_rows: int | None = None, 432 n_rows: int | None = None, 433 schema_sample_rows: int | None = 1_000, 434 dtype_coercion: Literal["coerce", "strict"] = "coerce", 435 use_columns: list[str] 436 | list[int] 437 | str 438 | Callable[[ColumnInfoNoDtype], bool] 439 | None = None, 440 dtypes: DType | DTypeMap | None = None, 441 ) -> ExcelSheet: 442 """Loads a sheet by name. 443 444 Refer to `load_sheet` for parameter documentation 445 """ 446 return self.load_sheet( 447 name, 448 header_row=header_row, 449 column_names=column_names, 450 skip_rows=skip_rows, 451 n_rows=n_rows, 452 schema_sample_rows=schema_sample_rows, 453 dtype_coercion=dtype_coercion, 454 use_columns=use_columns, 455 dtypes=dtypes, 456 ) 457 458 def load_sheet_by_idx( 459 self, 460 idx: int, 461 *, 462 header_row: int | None = 0, 463 column_names: list[str] | None = None, 464 skip_rows: int | None = None, 465 n_rows: int | None = None, 466 schema_sample_rows: int | None = 1_000, 467 dtype_coercion: Literal["coerce", "strict"] = "coerce", 468 use_columns: list[str] 469 | list[int] 470 | str 471 | Callable[[ColumnInfoNoDtype], bool] 472 | None = None, 473 dtypes: DType | DTypeMap | None = None, 474 ) -> ExcelSheet: 475 """Loads a sheet by index. 476 477 Refer to `load_sheet` for parameter documentation 478 """ 479 return self.load_sheet( 480 idx, 481 header_row=header_row, 482 column_names=column_names, 483 skip_rows=skip_rows, 484 n_rows=n_rows, 485 schema_sample_rows=schema_sample_rows, 486 dtype_coercion=dtype_coercion, 487 use_columns=use_columns, 488 dtypes=dtypes, 489 ) 490 491 def __repr__(self) -> str: 492 return self._reader.__repr__()
A class representing an open Excel file and allowing to read its sheets
200 @property 201 def sheet_names(self) -> list[str]: 202 """The list of sheet names""" 203 return self._reader.sheet_names
The list of sheet names
205 def load_sheet( 206 self, 207 idx_or_name: int | str, 208 *, 209 header_row: int | None = 0, 210 column_names: list[str] | None = None, 211 skip_rows: int | None = None, 212 n_rows: int | None = None, 213 schema_sample_rows: int | None = 1_000, 214 dtype_coercion: Literal["coerce", "strict"] = "coerce", 215 use_columns: list[str] 216 | list[int] 217 | str 218 | Callable[[ColumnInfoNoDtype], bool] 219 | None = None, 220 dtypes: DType | DTypeMap | None = None, 221 ) -> ExcelSheet: 222 """Loads a sheet lazily by index or name. 223 224 :param idx_or_name: The index (starting at 0) or the name of the sheet to load. 225 :param header_row: The index of the row containing the column labels, default index is 0. 226 If `None`, the sheet does not have any column labels. 227 Any rows before the `header_row` will be automatically skipped. 228 :param column_names: Overrides headers found in the document. 229 If `column_names` is used, `header_row` will be ignored. 230 :param n_rows: Specifies how many rows should be loaded. 231 If `None`, all rows are loaded 232 :param skip_rows: Specifies how many rows should be skipped after the `header_row`. 233 Any rows before the `header_row` are automatically skipped. 234 If `header_row` is `None`: 235 - if `skip_rows` is `None` (default): it skips all empty rows 236 at the beginning of the sheet. 237 - if `skip_rows` is a number, it skips the specified number 238 of rows from the start of the sheet. 239 :param schema_sample_rows: Specifies how many rows should be used to determine 240 the dtype of a column. Cannot be 0. A specific dtype can be 241 enforced for some or all columns through the `dtypes` parameter. 242 If `None`, all rows will be used. 243 :param dtype_coercion: Specifies how type coercion should behave. `coerce` (the default) 244 will try to coerce different dtypes in a column to the same one, 245 whereas `strict` will raise an error in case a column contains 246 several dtypes. Note that this only applies to columns whose dtype 247 is guessed, i.e. not specified via `dtypes`. 248 :param use_columns: Specifies the columns to use. Can either be: 249 - `None` to select all columns 250 - A list of strings and ints, the column names and/or indices 251 (starting at 0) 252 - A string, a comma separated list of Excel column letters and column 253 ranges (e.g. `“A:E”` or `“A,C,E:F”`, which would result in 254 `A,B,C,D,E` and `A,C,E,F`) 255 - A callable, a function that takes a column and returns a boolean 256 indicating whether the column should be used 257 :param dtypes: An optional dtype (for all columns) 258 or dict of dtypes with keys as column indices or names. 259 """ 260 return ExcelSheet( 261 self._reader.load_sheet( 262 idx_or_name=idx_or_name, 263 header_row=header_row, 264 column_names=column_names, 265 skip_rows=skip_rows, 266 n_rows=n_rows, 267 schema_sample_rows=schema_sample_rows, 268 dtype_coercion=dtype_coercion, 269 use_columns=use_columns, 270 dtypes=dtypes, 271 eager=False, 272 ) 273 )
Loads a sheet lazily by index or name.
Parameters
- idx_or_name: The index (starting at 0) or the name of the sheet to load.
- header_row: The index of the row containing the column labels, default index is 0.
If
None
, the sheet does not have any column labels. Any rows before theheader_row
will be automatically skipped. - column_names: Overrides headers found in the document.
If
column_names
is used,header_row
will be ignored. - n_rows: Specifies how many rows should be loaded.
If
None
, all rows are loaded - skip_rows: Specifies how many rows should be skipped after the
header_row
. Any rows before theheader_row
are automatically skipped. Ifheader_row
isNone
:- if
skip_rows
isNone
(default): it skips all empty rows at the beginning of the sheet. - if
skip_rows
is a number, it skips the specified number of rows from the start of the sheet.
- if
- schema_sample_rows: Specifies how many rows should be used to determine
the dtype of a column. Cannot be 0. A specific dtype can be
enforced for some or all columns through the
dtypes
parameter. IfNone
, all rows will be used. - dtype_coercion: Specifies how type coercion should behave.
coerce
(the default) will try to coerce different dtypes in a column to the same one, whereasstrict
will raise an error in case a column contains several dtypes. Note that this only applies to columns whose dtype is guessed, i.e. not specified viadtypes
. - use_columns: Specifies the columns to use. Can either be:
None
to select all columns- A list of strings and ints, the column names and/or indices (starting at 0)
- A string, a comma separated list of Excel column letters and column
ranges (e.g.
“A:E”
or“A,C,E:F”
, which would result inA,B,C,D,E
andA,C,E,F
) - A callable, a function that takes a column and returns a boolean indicating whether the column should be used
- dtypes: An optional dtype (for all columns) or dict of dtypes with keys as column indices or names.
275 def table_names(self, sheet_name: str | None = None) -> list[str]: 276 """The list of table names. 277 278 Will return an empty list if no tables are found. 279 280 :param sheet_name: If given, will limit the list to the given sheet, will be faster 281 too. 282 """ 283 return self._reader.table_names(sheet_name)
The list of table names.
Will return an empty list if no tables are found.
Parameters
- sheet_name: If given, will limit the list to the given sheet, will be faster too.
323 def load_table( 324 self, 325 name: str, 326 *, 327 header_row: int | None = None, 328 column_names: list[str] | None = None, 329 skip_rows: int = 0, 330 n_rows: int | None = None, 331 schema_sample_rows: int | None = 1_000, 332 dtype_coercion: Literal["coerce", "strict"] = "coerce", 333 use_columns: list[str] 334 | list[int] 335 | str 336 | Callable[[ColumnInfoNoDtype], bool] 337 | None = None, 338 dtypes: DType | DTypeMap | None = None, 339 eager: bool = False, 340 ) -> ExcelTable | pa.RecordBatch: 341 """Loads a table by name. 342 343 :param name: The name of the table to load. 344 :param header_row: The index of the row containing the column labels. 345 If `None`, the table's column names will be used. 346 Any rows before the `header_row` will be automatically skipped. 347 :param column_names: Overrides headers found in the document. 348 If `column_names` is used, `header_row` will be ignored. 349 :param n_rows: Specifies how many rows should be loaded. 350 If `None`, all rows are loaded 351 :param skip_rows: Specifies how many rows should be skipped after the `header_row`. 352 Any rows before the `header_row` are automatically skipped. 353 If `header_row` is `None`, it skips the number of rows from the 354 start of the sheet. 355 :param schema_sample_rows: Specifies how many rows should be used to determine 356 the dtype of a column. Cannot be 0. A specific dtype can be 357 enforced for some or all columns through the `dtypes` parameter. 358 If `None`, all rows will be used. 359 :param dtype_coercion: Specifies how type coercion should behave. `coerce` (the default) 360 will try to coerce different dtypes in a column to the same one, 361 whereas `strict` will raise an error in case a column contains 362 several dtypes. Note that this only applies to columns whose dtype 363 is guessed, i.e. not specified via `dtypes`. 364 :param use_columns: Specifies the columns to use. Can either be: 365 - `None` to select all columns 366 - A list of strings and ints, the column names and/or indices 367 (starting at 0) 368 - A string, a comma separated list of Excel column letters and column 369 ranges (e.g. `“A:E”` or `“A,C,E:F”`, which would result in 370 `A,B,C,D,E` and `A,C,E,F`) 371 - A callable, a function that takes a column and returns a boolean 372 indicating whether the column should be used 373 :param dtypes: An optional dtype (for all columns) 374 or dict of dtypes with keys as column indices or names. 375 """ 376 output = self._reader.load_table( # type:ignore[call-overload,misc] 377 name=name, 378 header_row=header_row, 379 column_names=column_names, 380 skip_rows=skip_rows, 381 n_rows=n_rows, 382 schema_sample_rows=schema_sample_rows, 383 dtype_coercion=dtype_coercion, 384 use_columns=use_columns, 385 dtypes=dtypes, 386 eager=eager, 387 ) 388 if eager: 389 return output 390 return ExcelTable(output)
Loads a table by name.
Parameters
- name: The name of the table to load.
- header_row: The index of the row containing the column labels.
If
None
, the table's column names will be used. Any rows before theheader_row
will be automatically skipped. - column_names: Overrides headers found in the document.
If
column_names
is used,header_row
will be ignored. - n_rows: Specifies how many rows should be loaded.
If
None
, all rows are loaded - skip_rows: Specifies how many rows should be skipped after the
header_row
. Any rows before theheader_row
are automatically skipped. Ifheader_row
isNone
, it skips the number of rows from the start of the sheet. - schema_sample_rows: Specifies how many rows should be used to determine
the dtype of a column. Cannot be 0. A specific dtype can be
enforced for some or all columns through the
dtypes
parameter. IfNone
, all rows will be used. - dtype_coercion: Specifies how type coercion should behave.
coerce
(the default) will try to coerce different dtypes in a column to the same one, whereasstrict
will raise an error in case a column contains several dtypes. Note that this only applies to columns whose dtype is guessed, i.e. not specified viadtypes
. - use_columns: Specifies the columns to use. Can either be:
None
to select all columns- A list of strings and ints, the column names and/or indices (starting at 0)
- A string, a comma separated list of Excel column letters and column
ranges (e.g.
“A:E”
or“A,C,E:F”
, which would result inA,B,C,D,E
andA,C,E,F
) - A callable, a function that takes a column and returns a boolean indicating whether the column should be used
- dtypes: An optional dtype (for all columns) or dict of dtypes with keys as column indices or names.
392 def load_sheet_eager( 393 self, 394 idx_or_name: int | str, 395 *, 396 header_row: int | None = 0, 397 column_names: list[str] | None = None, 398 skip_rows: int | None = None, 399 n_rows: int | None = None, 400 schema_sample_rows: int | None = 1_000, 401 dtype_coercion: Literal["coerce", "strict"] = "coerce", 402 use_columns: list[str] | list[int] | str | None = None, 403 dtypes: DType | DTypeMap | None = None, 404 ) -> pa.RecordBatch: 405 """Loads a sheet eagerly by index or name. 406 407 For xlsx files, this will be faster and more memory-efficient, as it will use 408 `worksheet_range_ref` under the hood, which returns borrowed types. 409 410 Refer to `load_sheet` for parameter documentation 411 """ 412 return self._reader.load_sheet( 413 idx_or_name=idx_or_name, 414 header_row=header_row, 415 column_names=column_names, 416 skip_rows=skip_rows, 417 n_rows=n_rows, 418 schema_sample_rows=schema_sample_rows, 419 dtype_coercion=dtype_coercion, 420 use_columns=use_columns, 421 dtypes=dtypes, 422 eager=True, 423 )
Loads a sheet eagerly by index or name.
For xlsx files, this will be faster and more memory-efficient, as it will use
worksheet_range_ref
under the hood, which returns borrowed types.
Refer to load_sheet
for parameter documentation
425 def load_sheet_by_name( 426 self, 427 name: str, 428 *, 429 header_row: int | None = 0, 430 column_names: list[str] | None = None, 431 skip_rows: int | None = None, 432 n_rows: int | None = None, 433 schema_sample_rows: int | None = 1_000, 434 dtype_coercion: Literal["coerce", "strict"] = "coerce", 435 use_columns: list[str] 436 | list[int] 437 | str 438 | Callable[[ColumnInfoNoDtype], bool] 439 | None = None, 440 dtypes: DType | DTypeMap | None = None, 441 ) -> ExcelSheet: 442 """Loads a sheet by name. 443 444 Refer to `load_sheet` for parameter documentation 445 """ 446 return self.load_sheet( 447 name, 448 header_row=header_row, 449 column_names=column_names, 450 skip_rows=skip_rows, 451 n_rows=n_rows, 452 schema_sample_rows=schema_sample_rows, 453 dtype_coercion=dtype_coercion, 454 use_columns=use_columns, 455 dtypes=dtypes, 456 )
Loads a sheet by name.
Refer to load_sheet
for parameter documentation
458 def load_sheet_by_idx( 459 self, 460 idx: int, 461 *, 462 header_row: int | None = 0, 463 column_names: list[str] | None = None, 464 skip_rows: int | None = None, 465 n_rows: int | None = None, 466 schema_sample_rows: int | None = 1_000, 467 dtype_coercion: Literal["coerce", "strict"] = "coerce", 468 use_columns: list[str] 469 | list[int] 470 | str 471 | Callable[[ColumnInfoNoDtype], bool] 472 | None = None, 473 dtypes: DType | DTypeMap | None = None, 474 ) -> ExcelSheet: 475 """Loads a sheet by index. 476 477 Refer to `load_sheet` for parameter documentation 478 """ 479 return self.load_sheet( 480 idx, 481 header_row=header_row, 482 column_names=column_names, 483 skip_rows=skip_rows, 484 n_rows=n_rows, 485 schema_sample_rows=schema_sample_rows, 486 dtype_coercion=dtype_coercion, 487 use_columns=use_columns, 488 dtypes=dtypes, 489 )
Loads a sheet by index.
Refer to load_sheet
for parameter documentation
56class ExcelSheet: 57 """A class representing a single sheet in an Excel File""" 58 59 def __init__(self, sheet: _ExcelSheet) -> None: 60 self._sheet = sheet 61 62 @property 63 def name(self) -> str: 64 """The name of the sheet""" 65 return self._sheet.name 66 67 @property 68 def width(self) -> int: 69 """The sheet's width""" 70 return self._sheet.width 71 72 @property 73 def height(self) -> int: 74 """The sheet's height, with `skip_rows` and `nrows` applied""" 75 return self._sheet.height 76 77 @property 78 def total_height(self) -> int: 79 """The sheet's total height""" 80 return self._sheet.total_height 81 82 @property 83 def selected_columns(self) -> list[ColumnInfo]: 84 """The sheet's selected columns""" 85 return self._sheet.selected_columns 86 87 def available_columns(self) -> list[ColumnInfo]: 88 """The columns available for the given sheet""" 89 return self._sheet.available_columns() 90 91 @property 92 def specified_dtypes(self) -> DTypeMap | None: 93 """The dtypes specified for the sheet""" 94 return self._sheet.specified_dtypes 95 96 @property 97 def visible(self) -> SheetVisible: 98 """The visibility of the sheet""" 99 return self._sheet.visible 100 101 def to_arrow(self) -> pa.RecordBatch: 102 """Converts the sheet to a pyarrow `RecordBatch`""" 103 return self._sheet.to_arrow() 104 105 def to_pandas(self) -> "pd.DataFrame": 106 """Converts the sheet to a Pandas `DataFrame`. 107 108 Requires the `pandas` extra to be installed. 109 """ 110 # We know for sure that the sheet will yield exactly one RecordBatch 111 return self.to_arrow().to_pandas() 112 113 def to_polars(self) -> "pl.DataFrame": 114 """Converts the sheet to a Polars `DataFrame`. 115 116 Requires the `polars` extra to be installed. 117 """ 118 return _recordbatch_to_polars(self.to_arrow()) 119 120 def __repr__(self) -> str: 121 return self._sheet.__repr__()
A class representing a single sheet in an Excel File
72 @property 73 def height(self) -> int: 74 """The sheet's height, with `skip_rows` and `nrows` applied""" 75 return self._sheet.height
The sheet's height, with skip_rows
and nrows
applied
77 @property 78 def total_height(self) -> int: 79 """The sheet's total height""" 80 return self._sheet.total_height
The sheet's total height
82 @property 83 def selected_columns(self) -> list[ColumnInfo]: 84 """The sheet's selected columns""" 85 return self._sheet.selected_columns
The sheet's selected columns
87 def available_columns(self) -> list[ColumnInfo]: 88 """The columns available for the given sheet""" 89 return self._sheet.available_columns()
The columns available for the given sheet
91 @property 92 def specified_dtypes(self) -> DTypeMap | None: 93 """The dtypes specified for the sheet""" 94 return self._sheet.specified_dtypes
The dtypes specified for the sheet
96 @property 97 def visible(self) -> SheetVisible: 98 """The visibility of the sheet""" 99 return self._sheet.visible
The visibility of the sheet
101 def to_arrow(self) -> pa.RecordBatch: 102 """Converts the sheet to a pyarrow `RecordBatch`""" 103 return self._sheet.to_arrow()
Converts the sheet to a pyarrow RecordBatch
105 def to_pandas(self) -> "pd.DataFrame": 106 """Converts the sheet to a Pandas `DataFrame`. 107 108 Requires the `pandas` extra to be installed. 109 """ 110 # We know for sure that the sheet will yield exactly one RecordBatch 111 return self.to_arrow().to_pandas()
Converts the sheet to a Pandas DataFrame
.
Requires the pandas
extra to be installed.
113 def to_polars(self) -> "pl.DataFrame": 114 """Converts the sheet to a Polars `DataFrame`. 115 116 Requires the `polars` extra to be installed. 117 """ 118 return _recordbatch_to_polars(self.to_arrow())
Converts the sheet to a Polars DataFrame
.
Requires the polars
extra to be installed.
This class provides information about a single column in a sheet
fastexcel.DTypeFrom
. How the dtype of the column was determined.
One of three possible values:
"provided_by_index"
: The dtype was specified via the column index"provided_by_name"
: The dtype was specified via the column name"guessed"
: The dtype was determined from the content of the column
fastexcel.ColumnNameFrom
. How the name of the column was determined.
One of three possible values:
"provided"
: The column name was provided via theuse_columns
parameter"looked_up"
: The column name was looked up from the data found in the sheet"generated"
: The column name was generated from the column index, either becauseheader_row
wasNone
, or because it could not be looked up
The base class for all fastexcel errors
Data for a given cell cannot be retrieved
calamine returned an error regarding the content of the cell
Generic calamine error
Sheet was not found
Column was not found
Generic arrow error
Provided parameters are invalid
Column contains an unsupported type combination