fastexcel
1from __future__ import annotations 2 3import sys 4import typing 5from typing import TYPE_CHECKING, Callable, Literal 6 7if sys.version_info < (3, 10): 8 from typing_extensions import TypeAlias 9else: 10 from typing import TypeAlias 11 12if TYPE_CHECKING: 13 import pandas as pd 14 import polars as pl 15 16from os.path import expanduser 17from pathlib import Path 18 19import pyarrow as pa 20 21from ._fastexcel import ( 22 ArrowError, 23 CalamineCellError, 24 CalamineError, 25 CannotRetrieveCellDataError, 26 ColumnInfo, 27 ColumnNotFoundError, 28 FastExcelError, 29 InvalidParametersError, 30 SheetNotFoundError, 31 UnsupportedColumnTypeCombinationError, 32 __version__, 33 _ExcelReader, 34 _ExcelSheet, 35 _ExcelTable, 36) 37from ._fastexcel import read_excel as _read_excel 38 39DType = Literal["null", "int", "float", "string", "boolean", "datetime", "date", "duration"] 40DTypeMap: TypeAlias = "dict[str | int, DType]" 41ColumnNameFrom: TypeAlias = Literal["provided", "looked_up", "generated"] 42DTypeFrom: TypeAlias = Literal["provided_by_index", "provided_by_name", "guessed"] 43SheetVisible: TypeAlias = Literal["visible", "hidden", "veryhidden"] 44 45 46def _recordbatch_to_polars(rb: pa.RecordBatch) -> pl.DataFrame: 47 import polars as pl 48 49 df = pl.from_arrow(data=rb) 50 assert isinstance(df, pl.DataFrame) 51 return df 52 53 54class ExcelSheet: 55 """A class representing a single sheet in an Excel File""" 56 57 def __init__(self, sheet: _ExcelSheet) -> None: 58 self._sheet = sheet 59 60 @property 61 def name(self) -> str: 62 """The name of the sheet""" 63 return self._sheet.name 64 65 @property 66 def width(self) -> int: 67 """The sheet's width""" 68 return self._sheet.width 69 70 @property 71 def height(self) -> int: 72 """The sheet's height, with `skip_rows` and `nrows` applied""" 73 return self._sheet.height 74 75 @property 76 def total_height(self) -> int: 77 """The sheet's total height""" 78 return self._sheet.total_height 79 80 @property 81 def selected_columns(self) -> list[ColumnInfo]: 82 """The sheet's selected columns""" 83 return self._sheet.selected_columns 84 85 @property 86 def available_columns(self) -> list[ColumnInfo]: 87 """The columns available for the given sheet""" 88 return self._sheet.available_columns 89 90 @property 91 def specified_dtypes(self) -> DTypeMap | None: 92 """The dtypes specified for the sheet""" 93 return self._sheet.specified_dtypes 94 95 @property 96 def visible(self) -> SheetVisible: 97 """The visibility of the sheet""" 98 return self._sheet.visible 99 100 def to_arrow(self) -> pa.RecordBatch: 101 """Converts the sheet to a pyarrow `RecordBatch`""" 102 return self._sheet.to_arrow() 103 104 def to_pandas(self) -> "pd.DataFrame": 105 """Converts the sheet to a Pandas `DataFrame`. 106 107 Requires the `pandas` extra to be installed. 108 """ 109 # We know for sure that the sheet will yield exactly one RecordBatch 110 return self.to_arrow().to_pandas() 111 112 def to_polars(self) -> "pl.DataFrame": 113 """Converts the sheet to a Polars `DataFrame`. 114 115 Requires the `polars` extra to be installed. 116 """ 117 return _recordbatch_to_polars(self.to_arrow()) 118 119 def __repr__(self) -> str: 120 return self._sheet.__repr__() 121 122 123class ExcelTable: 124 """A class representing a single table in an Excel file""" 125 126 def __init__(self, table: _ExcelTable) -> None: 127 self._table = table 128 129 @property 130 def name(self) -> str: 131 """The name of the table""" 132 return self._table.name 133 134 @property 135 def sheet_name(self) -> str: 136 """The name of the sheet this table belongs to""" 137 return self._table.sheet_name 138 139 @property 140 def width(self) -> int: 141 """The table's width""" 142 return self._table.width 143 144 @property 145 def height(self) -> int: 146 """The table's height""" 147 return self._table.height 148 149 @property 150 def total_height(self) -> int: 151 """The table's total height""" 152 return self._table.total_height 153 154 @property 155 def offset(self) -> int: 156 """The table's offset before data starts""" 157 return self._table.offset 158 159 @property 160 def selected_columns(self) -> list[ColumnInfo]: 161 """The table's selected columns""" 162 return self._table.selected_columns 163 164 @property 165 def available_columns(self) -> list[ColumnInfo]: 166 """The columns available for the given table""" 167 return self._table.available_columns 168 169 @property 170 def specified_dtypes(self) -> DTypeMap | None: 171 """The dtypes specified for the table""" 172 return self._table.specified_dtypes 173 174 def to_arrow(self) -> pa.RecordBatch: 175 """Converts the table to a pyarrow `RecordBatch`""" 176 return self._table.to_arrow() 177 178 def to_pandas(self) -> "pd.DataFrame": 179 """Converts the table to a Pandas `DataFrame`. 180 181 Requires the `pandas` extra to be installed. 182 """ 183 # We know for sure that the table will yield exactly one RecordBatch 184 return self.to_arrow().to_pandas() 185 186 def to_polars(self) -> "pl.DataFrame": 187 """Converts the table to a Polars `DataFrame`. 188 189 Requires the `polars` extra to be installed. 190 """ 191 return _recordbatch_to_polars(self.to_arrow()) 192 193 194class ExcelReader: 195 """A class representing an open Excel file and allowing to read its sheets""" 196 197 def __init__(self, reader: _ExcelReader) -> None: 198 self._reader = reader 199 200 @property 201 def sheet_names(self) -> list[str]: 202 """The list of sheet names""" 203 return self._reader.sheet_names 204 205 def load_sheet( 206 self, 207 idx_or_name: int | str, 208 *, 209 header_row: int | None = 0, 210 column_names: list[str] | None = None, 211 skip_rows: int | None = None, 212 n_rows: int | None = None, 213 schema_sample_rows: int | None = 1_000, 214 dtype_coercion: Literal["coerce", "strict"] = "coerce", 215 use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None, 216 dtypes: DType | DTypeMap | None = None, 217 ) -> ExcelSheet: 218 """Loads a sheet lazily by index or name. 219 220 :param idx_or_name: The index (starting at 0) or the name of the sheet to load. 221 :param header_row: The index of the row containing the column labels, default index is 0. 222 If `None`, the sheet does not have any column labels. 223 Any rows before the `header_row` will be automatically skipped. 224 :param column_names: Overrides headers found in the document. 225 If `column_names` is used, `header_row` will be ignored. 226 :param n_rows: Specifies how many rows should be loaded. 227 If `None`, all rows are loaded 228 :param skip_rows: Specifies how many rows should be skipped after the `header_row`. 229 Any rows before the `header_row` are automatically skipped. 230 If `header_row` is `None`: 231 - if `skip_rows` is `None` (default): it skips all empty rows 232 at the beginning of the sheet. 233 - if `skip_rows` is a number, it skips the specified number 234 of rows from the start of the sheet. 235 :param schema_sample_rows: Specifies how many rows should be used to determine 236 the dtype of a column. Cannot be 0. A specific dtype can be 237 enforced for some or all columns through the `dtypes` parameter. 238 If `None`, all rows will be used. 239 :param dtype_coercion: Specifies how type coercion should behave. `coerce` (the default) 240 will try to coerce different dtypes in a column to the same one, 241 whereas `strict` will raise an error in case a column contains 242 several dtypes. Note that this only applies to columns whose dtype 243 is guessed, i.e. not specified via `dtypes`. 244 :param use_columns: Specifies the columns to use. Can either be: 245 - `None` to select all columns 246 - A list of strings and ints, the column names and/or indices 247 (starting at 0) 248 - A string, a comma separated list of Excel column letters and column 249 ranges (e.g. `“A:E”` or `“A,C,E:F”`, which would result in 250 `A,B,C,D,E` and `A,C,E,F`) 251 - A callable, a function that takes a column and returns a boolean 252 indicating whether the column should be used 253 :param dtypes: An optional dtype (for all columns) 254 or dict of dtypes with keys as column indices or names. 255 """ 256 return ExcelSheet( 257 self._reader.load_sheet( 258 idx_or_name=idx_or_name, 259 header_row=header_row, 260 column_names=column_names, 261 skip_rows=skip_rows, 262 n_rows=n_rows, 263 schema_sample_rows=schema_sample_rows, 264 dtype_coercion=dtype_coercion, 265 use_columns=use_columns, 266 dtypes=dtypes, 267 eager=False, 268 ) 269 ) 270 271 def table_names(self, sheet_name: str | None = None) -> list[str]: 272 """The list of table names. 273 274 Will return an empty list if no tables are found. 275 276 :param sheet_name: If given, will limit the list to the given sheet, will be faster 277 too. 278 """ 279 return self._reader.table_names(sheet_name) 280 281 @typing.overload 282 def load_table( 283 self, 284 name: str, 285 *, 286 header_row: int | None = None, 287 column_names: list[str] | None = None, 288 skip_rows: int = 0, 289 n_rows: int | None = None, 290 schema_sample_rows: int | None = 1_000, 291 dtype_coercion: Literal["coerce", "strict"] = "coerce", 292 use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None, 293 dtypes: DType | DTypeMap | None = None, 294 eager: Literal[False] = ..., 295 ) -> ExcelTable: ... 296 @typing.overload 297 def load_table( 298 self, 299 name: str, 300 *, 301 header_row: int | None = None, 302 column_names: list[str] | None = None, 303 skip_rows: int = 0, 304 n_rows: int | None = None, 305 schema_sample_rows: int | None = 1_000, 306 dtype_coercion: Literal["coerce", "strict"] = "coerce", 307 use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None, 308 dtypes: DType | DTypeMap | None = None, 309 eager: Literal[True] = ..., 310 ) -> pa.RecordBatch: ... 311 def load_table( 312 self, 313 name: str, 314 *, 315 header_row: int | None = None, 316 column_names: list[str] | None = None, 317 skip_rows: int = 0, 318 n_rows: int | None = None, 319 schema_sample_rows: int | None = 1_000, 320 dtype_coercion: Literal["coerce", "strict"] = "coerce", 321 use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None, 322 dtypes: DType | DTypeMap | None = None, 323 eager: bool = False, 324 ) -> ExcelTable | pa.RecordBatch: 325 """Loads a table by name. 326 327 :param name: The name of the table to load. 328 :param header_row: The index of the row containing the column labels. 329 If `None`, the table's column names will be used. 330 Any rows before the `header_row` will be automatically skipped. 331 :param column_names: Overrides headers found in the document. 332 If `column_names` is used, `header_row` will be ignored. 333 :param n_rows: Specifies how many rows should be loaded. 334 If `None`, all rows are loaded 335 :param skip_rows: Specifies how many rows should be skipped after the `header_row`. 336 Any rows before the `header_row` are automatically skipped. 337 If `header_row` is `None`, it skips the number of rows from the 338 start of the sheet. 339 :param schema_sample_rows: Specifies how many rows should be used to determine 340 the dtype of a column. Cannot be 0. A specific dtype can be 341 enforced for some or all columns through the `dtypes` parameter. 342 If `None`, all rows will be used. 343 :param dtype_coercion: Specifies how type coercion should behave. `coerce` (the default) 344 will try to coerce different dtypes in a column to the same one, 345 whereas `strict` will raise an error in case a column contains 346 several dtypes. Note that this only applies to columns whose dtype 347 is guessed, i.e. not specified via `dtypes`. 348 :param use_columns: Specifies the columns to use. Can either be: 349 - `None` to select all columns 350 - A list of strings and ints, the column names and/or indices 351 (starting at 0) 352 - A string, a comma separated list of Excel column letters and column 353 ranges (e.g. `“A:E”` or `“A,C,E:F”`, which would result in 354 `A,B,C,D,E` and `A,C,E,F`) 355 - A callable, a function that takes a column and returns a boolean 356 indicating whether the column should be used 357 :param dtypes: An optional dtype (for all columns) 358 or dict of dtypes with keys as column indices or names. 359 """ 360 output = self._reader.load_table( # type:ignore[call-overload,misc] 361 name=name, 362 header_row=header_row, 363 column_names=column_names, 364 skip_rows=skip_rows, 365 n_rows=n_rows, 366 schema_sample_rows=schema_sample_rows, 367 dtype_coercion=dtype_coercion, 368 use_columns=use_columns, 369 dtypes=dtypes, 370 eager=eager, 371 ) 372 if eager: 373 return output 374 return ExcelTable(output) 375 376 def load_sheet_eager( 377 self, 378 idx_or_name: int | str, 379 *, 380 header_row: int | None = 0, 381 column_names: list[str] | None = None, 382 skip_rows: int | None = None, 383 n_rows: int | None = None, 384 schema_sample_rows: int | None = 1_000, 385 dtype_coercion: Literal["coerce", "strict"] = "coerce", 386 use_columns: list[str] | list[int] | str | None = None, 387 dtypes: DType | DTypeMap | None = None, 388 ) -> pa.RecordBatch: 389 """Loads a sheet eagerly by index or name. 390 391 For xlsx files, this will be faster and more memory-efficient, as it will use 392 `worksheet_range_ref` under the hood, which returns borrowed types. 393 394 Refer to `load_sheet` for parameter documentation 395 """ 396 return self._reader.load_sheet( 397 idx_or_name=idx_or_name, 398 header_row=header_row, 399 column_names=column_names, 400 skip_rows=skip_rows, 401 n_rows=n_rows, 402 schema_sample_rows=schema_sample_rows, 403 dtype_coercion=dtype_coercion, 404 use_columns=use_columns, 405 dtypes=dtypes, 406 eager=True, 407 ) 408 409 def load_sheet_by_name( 410 self, 411 name: str, 412 *, 413 header_row: int | None = 0, 414 column_names: list[str] | None = None, 415 skip_rows: int | None = None, 416 n_rows: int | None = None, 417 schema_sample_rows: int | None = 1_000, 418 dtype_coercion: Literal["coerce", "strict"] = "coerce", 419 use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None, 420 dtypes: DType | DTypeMap | None = None, 421 ) -> ExcelSheet: 422 """Loads a sheet by name. 423 424 Refer to `load_sheet` for parameter documentation 425 """ 426 return self.load_sheet( 427 name, 428 header_row=header_row, 429 column_names=column_names, 430 skip_rows=skip_rows, 431 n_rows=n_rows, 432 schema_sample_rows=schema_sample_rows, 433 dtype_coercion=dtype_coercion, 434 use_columns=use_columns, 435 dtypes=dtypes, 436 ) 437 438 def load_sheet_by_idx( 439 self, 440 idx: int, 441 *, 442 header_row: int | None = 0, 443 column_names: list[str] | None = None, 444 skip_rows: int | None = None, 445 n_rows: int | None = None, 446 schema_sample_rows: int | None = 1_000, 447 dtype_coercion: Literal["coerce", "strict"] = "coerce", 448 use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None, 449 dtypes: DType | DTypeMap | None = None, 450 ) -> ExcelSheet: 451 """Loads a sheet by index. 452 453 Refer to `load_sheet` for parameter documentation 454 """ 455 return self.load_sheet( 456 idx, 457 header_row=header_row, 458 column_names=column_names, 459 skip_rows=skip_rows, 460 n_rows=n_rows, 461 schema_sample_rows=schema_sample_rows, 462 dtype_coercion=dtype_coercion, 463 use_columns=use_columns, 464 dtypes=dtypes, 465 ) 466 467 def __repr__(self) -> str: 468 return self._reader.__repr__() 469 470 471def read_excel(source: Path | str | bytes) -> ExcelReader: 472 """Opens and loads an excel file. 473 474 :param source: The path to a file or its content as bytes 475 """ 476 if isinstance(source, (str, Path)): 477 source = expanduser(source) 478 return ExcelReader(_read_excel(source)) 479 480 481__all__ = ( 482 ## version 483 "__version__", 484 ## main entrypoint 485 "read_excel", 486 ## Python types 487 "DType", 488 "DTypeMap", 489 # Excel reader 490 "ExcelReader", 491 # Excel sheet 492 "ExcelSheet", 493 # Column metadata 494 "DTypeFrom", 495 "ColumnNameFrom", 496 "ColumnInfo", 497 # Exceptions 498 "FastExcelError", 499 "CannotRetrieveCellDataError", 500 "CalamineCellError", 501 "CalamineError", 502 "SheetNotFoundError", 503 "ColumnNotFoundError", 504 "ArrowError", 505 "InvalidParametersError", 506 "UnsupportedColumnTypeCombinationError", 507)
472def read_excel(source: Path | str | bytes) -> ExcelReader: 473 """Opens and loads an excel file. 474 475 :param source: The path to a file or its content as bytes 476 """ 477 if isinstance(source, (str, Path)): 478 source = expanduser(source) 479 return ExcelReader(_read_excel(source))
Opens and loads an excel file.
Parameters
- source: The path to a file or its content as bytes
195class ExcelReader: 196 """A class representing an open Excel file and allowing to read its sheets""" 197 198 def __init__(self, reader: _ExcelReader) -> None: 199 self._reader = reader 200 201 @property 202 def sheet_names(self) -> list[str]: 203 """The list of sheet names""" 204 return self._reader.sheet_names 205 206 def load_sheet( 207 self, 208 idx_or_name: int | str, 209 *, 210 header_row: int | None = 0, 211 column_names: list[str] | None = None, 212 skip_rows: int | None = None, 213 n_rows: int | None = None, 214 schema_sample_rows: int | None = 1_000, 215 dtype_coercion: Literal["coerce", "strict"] = "coerce", 216 use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None, 217 dtypes: DType | DTypeMap | None = None, 218 ) -> ExcelSheet: 219 """Loads a sheet lazily by index or name. 220 221 :param idx_or_name: The index (starting at 0) or the name of the sheet to load. 222 :param header_row: The index of the row containing the column labels, default index is 0. 223 If `None`, the sheet does not have any column labels. 224 Any rows before the `header_row` will be automatically skipped. 225 :param column_names: Overrides headers found in the document. 226 If `column_names` is used, `header_row` will be ignored. 227 :param n_rows: Specifies how many rows should be loaded. 228 If `None`, all rows are loaded 229 :param skip_rows: Specifies how many rows should be skipped after the `header_row`. 230 Any rows before the `header_row` are automatically skipped. 231 If `header_row` is `None`: 232 - if `skip_rows` is `None` (default): it skips all empty rows 233 at the beginning of the sheet. 234 - if `skip_rows` is a number, it skips the specified number 235 of rows from the start of the sheet. 236 :param schema_sample_rows: Specifies how many rows should be used to determine 237 the dtype of a column. Cannot be 0. A specific dtype can be 238 enforced for some or all columns through the `dtypes` parameter. 239 If `None`, all rows will be used. 240 :param dtype_coercion: Specifies how type coercion should behave. `coerce` (the default) 241 will try to coerce different dtypes in a column to the same one, 242 whereas `strict` will raise an error in case a column contains 243 several dtypes. Note that this only applies to columns whose dtype 244 is guessed, i.e. not specified via `dtypes`. 245 :param use_columns: Specifies the columns to use. Can either be: 246 - `None` to select all columns 247 - A list of strings and ints, the column names and/or indices 248 (starting at 0) 249 - A string, a comma separated list of Excel column letters and column 250 ranges (e.g. `“A:E”` or `“A,C,E:F”`, which would result in 251 `A,B,C,D,E` and `A,C,E,F`) 252 - A callable, a function that takes a column and returns a boolean 253 indicating whether the column should be used 254 :param dtypes: An optional dtype (for all columns) 255 or dict of dtypes with keys as column indices or names. 256 """ 257 return ExcelSheet( 258 self._reader.load_sheet( 259 idx_or_name=idx_or_name, 260 header_row=header_row, 261 column_names=column_names, 262 skip_rows=skip_rows, 263 n_rows=n_rows, 264 schema_sample_rows=schema_sample_rows, 265 dtype_coercion=dtype_coercion, 266 use_columns=use_columns, 267 dtypes=dtypes, 268 eager=False, 269 ) 270 ) 271 272 def table_names(self, sheet_name: str | None = None) -> list[str]: 273 """The list of table names. 274 275 Will return an empty list if no tables are found. 276 277 :param sheet_name: If given, will limit the list to the given sheet, will be faster 278 too. 279 """ 280 return self._reader.table_names(sheet_name) 281 282 @typing.overload 283 def load_table( 284 self, 285 name: str, 286 *, 287 header_row: int | None = None, 288 column_names: list[str] | None = None, 289 skip_rows: int = 0, 290 n_rows: int | None = None, 291 schema_sample_rows: int | None = 1_000, 292 dtype_coercion: Literal["coerce", "strict"] = "coerce", 293 use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None, 294 dtypes: DType | DTypeMap | None = None, 295 eager: Literal[False] = ..., 296 ) -> ExcelTable: ... 297 @typing.overload 298 def load_table( 299 self, 300 name: str, 301 *, 302 header_row: int | None = None, 303 column_names: list[str] | None = None, 304 skip_rows: int = 0, 305 n_rows: int | None = None, 306 schema_sample_rows: int | None = 1_000, 307 dtype_coercion: Literal["coerce", "strict"] = "coerce", 308 use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None, 309 dtypes: DType | DTypeMap | None = None, 310 eager: Literal[True] = ..., 311 ) -> pa.RecordBatch: ... 312 def load_table( 313 self, 314 name: str, 315 *, 316 header_row: int | None = None, 317 column_names: list[str] | None = None, 318 skip_rows: int = 0, 319 n_rows: int | None = None, 320 schema_sample_rows: int | None = 1_000, 321 dtype_coercion: Literal["coerce", "strict"] = "coerce", 322 use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None, 323 dtypes: DType | DTypeMap | None = None, 324 eager: bool = False, 325 ) -> ExcelTable | pa.RecordBatch: 326 """Loads a table by name. 327 328 :param name: The name of the table to load. 329 :param header_row: The index of the row containing the column labels. 330 If `None`, the table's column names will be used. 331 Any rows before the `header_row` will be automatically skipped. 332 :param column_names: Overrides headers found in the document. 333 If `column_names` is used, `header_row` will be ignored. 334 :param n_rows: Specifies how many rows should be loaded. 335 If `None`, all rows are loaded 336 :param skip_rows: Specifies how many rows should be skipped after the `header_row`. 337 Any rows before the `header_row` are automatically skipped. 338 If `header_row` is `None`, it skips the number of rows from the 339 start of the sheet. 340 :param schema_sample_rows: Specifies how many rows should be used to determine 341 the dtype of a column. Cannot be 0. A specific dtype can be 342 enforced for some or all columns through the `dtypes` parameter. 343 If `None`, all rows will be used. 344 :param dtype_coercion: Specifies how type coercion should behave. `coerce` (the default) 345 will try to coerce different dtypes in a column to the same one, 346 whereas `strict` will raise an error in case a column contains 347 several dtypes. Note that this only applies to columns whose dtype 348 is guessed, i.e. not specified via `dtypes`. 349 :param use_columns: Specifies the columns to use. Can either be: 350 - `None` to select all columns 351 - A list of strings and ints, the column names and/or indices 352 (starting at 0) 353 - A string, a comma separated list of Excel column letters and column 354 ranges (e.g. `“A:E”` or `“A,C,E:F”`, which would result in 355 `A,B,C,D,E` and `A,C,E,F`) 356 - A callable, a function that takes a column and returns a boolean 357 indicating whether the column should be used 358 :param dtypes: An optional dtype (for all columns) 359 or dict of dtypes with keys as column indices or names. 360 """ 361 output = self._reader.load_table( # type:ignore[call-overload,misc] 362 name=name, 363 header_row=header_row, 364 column_names=column_names, 365 skip_rows=skip_rows, 366 n_rows=n_rows, 367 schema_sample_rows=schema_sample_rows, 368 dtype_coercion=dtype_coercion, 369 use_columns=use_columns, 370 dtypes=dtypes, 371 eager=eager, 372 ) 373 if eager: 374 return output 375 return ExcelTable(output) 376 377 def load_sheet_eager( 378 self, 379 idx_or_name: int | str, 380 *, 381 header_row: int | None = 0, 382 column_names: list[str] | None = None, 383 skip_rows: int | None = None, 384 n_rows: int | None = None, 385 schema_sample_rows: int | None = 1_000, 386 dtype_coercion: Literal["coerce", "strict"] = "coerce", 387 use_columns: list[str] | list[int] | str | None = None, 388 dtypes: DType | DTypeMap | None = None, 389 ) -> pa.RecordBatch: 390 """Loads a sheet eagerly by index or name. 391 392 For xlsx files, this will be faster and more memory-efficient, as it will use 393 `worksheet_range_ref` under the hood, which returns borrowed types. 394 395 Refer to `load_sheet` for parameter documentation 396 """ 397 return self._reader.load_sheet( 398 idx_or_name=idx_or_name, 399 header_row=header_row, 400 column_names=column_names, 401 skip_rows=skip_rows, 402 n_rows=n_rows, 403 schema_sample_rows=schema_sample_rows, 404 dtype_coercion=dtype_coercion, 405 use_columns=use_columns, 406 dtypes=dtypes, 407 eager=True, 408 ) 409 410 def load_sheet_by_name( 411 self, 412 name: str, 413 *, 414 header_row: int | None = 0, 415 column_names: list[str] | None = None, 416 skip_rows: int | None = None, 417 n_rows: int | None = None, 418 schema_sample_rows: int | None = 1_000, 419 dtype_coercion: Literal["coerce", "strict"] = "coerce", 420 use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None, 421 dtypes: DType | DTypeMap | None = None, 422 ) -> ExcelSheet: 423 """Loads a sheet by name. 424 425 Refer to `load_sheet` for parameter documentation 426 """ 427 return self.load_sheet( 428 name, 429 header_row=header_row, 430 column_names=column_names, 431 skip_rows=skip_rows, 432 n_rows=n_rows, 433 schema_sample_rows=schema_sample_rows, 434 dtype_coercion=dtype_coercion, 435 use_columns=use_columns, 436 dtypes=dtypes, 437 ) 438 439 def load_sheet_by_idx( 440 self, 441 idx: int, 442 *, 443 header_row: int | None = 0, 444 column_names: list[str] | None = None, 445 skip_rows: int | None = None, 446 n_rows: int | None = None, 447 schema_sample_rows: int | None = 1_000, 448 dtype_coercion: Literal["coerce", "strict"] = "coerce", 449 use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None, 450 dtypes: DType | DTypeMap | None = None, 451 ) -> ExcelSheet: 452 """Loads a sheet by index. 453 454 Refer to `load_sheet` for parameter documentation 455 """ 456 return self.load_sheet( 457 idx, 458 header_row=header_row, 459 column_names=column_names, 460 skip_rows=skip_rows, 461 n_rows=n_rows, 462 schema_sample_rows=schema_sample_rows, 463 dtype_coercion=dtype_coercion, 464 use_columns=use_columns, 465 dtypes=dtypes, 466 ) 467 468 def __repr__(self) -> str: 469 return self._reader.__repr__()
A class representing an open Excel file and allowing to read its sheets
201 @property 202 def sheet_names(self) -> list[str]: 203 """The list of sheet names""" 204 return self._reader.sheet_names
The list of sheet names
206 def load_sheet( 207 self, 208 idx_or_name: int | str, 209 *, 210 header_row: int | None = 0, 211 column_names: list[str] | None = None, 212 skip_rows: int | None = None, 213 n_rows: int | None = None, 214 schema_sample_rows: int | None = 1_000, 215 dtype_coercion: Literal["coerce", "strict"] = "coerce", 216 use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None, 217 dtypes: DType | DTypeMap | None = None, 218 ) -> ExcelSheet: 219 """Loads a sheet lazily by index or name. 220 221 :param idx_or_name: The index (starting at 0) or the name of the sheet to load. 222 :param header_row: The index of the row containing the column labels, default index is 0. 223 If `None`, the sheet does not have any column labels. 224 Any rows before the `header_row` will be automatically skipped. 225 :param column_names: Overrides headers found in the document. 226 If `column_names` is used, `header_row` will be ignored. 227 :param n_rows: Specifies how many rows should be loaded. 228 If `None`, all rows are loaded 229 :param skip_rows: Specifies how many rows should be skipped after the `header_row`. 230 Any rows before the `header_row` are automatically skipped. 231 If `header_row` is `None`: 232 - if `skip_rows` is `None` (default): it skips all empty rows 233 at the beginning of the sheet. 234 - if `skip_rows` is a number, it skips the specified number 235 of rows from the start of the sheet. 236 :param schema_sample_rows: Specifies how many rows should be used to determine 237 the dtype of a column. Cannot be 0. A specific dtype can be 238 enforced for some or all columns through the `dtypes` parameter. 239 If `None`, all rows will be used. 240 :param dtype_coercion: Specifies how type coercion should behave. `coerce` (the default) 241 will try to coerce different dtypes in a column to the same one, 242 whereas `strict` will raise an error in case a column contains 243 several dtypes. Note that this only applies to columns whose dtype 244 is guessed, i.e. not specified via `dtypes`. 245 :param use_columns: Specifies the columns to use. Can either be: 246 - `None` to select all columns 247 - A list of strings and ints, the column names and/or indices 248 (starting at 0) 249 - A string, a comma separated list of Excel column letters and column 250 ranges (e.g. `“A:E”` or `“A,C,E:F”`, which would result in 251 `A,B,C,D,E` and `A,C,E,F`) 252 - A callable, a function that takes a column and returns a boolean 253 indicating whether the column should be used 254 :param dtypes: An optional dtype (for all columns) 255 or dict of dtypes with keys as column indices or names. 256 """ 257 return ExcelSheet( 258 self._reader.load_sheet( 259 idx_or_name=idx_or_name, 260 header_row=header_row, 261 column_names=column_names, 262 skip_rows=skip_rows, 263 n_rows=n_rows, 264 schema_sample_rows=schema_sample_rows, 265 dtype_coercion=dtype_coercion, 266 use_columns=use_columns, 267 dtypes=dtypes, 268 eager=False, 269 ) 270 )
Loads a sheet lazily by index or name.
Parameters
- idx_or_name: The index (starting at 0) or the name of the sheet to load.
- header_row: The index of the row containing the column labels, default index is 0.
If
None
, the sheet does not have any column labels. Any rows before theheader_row
will be automatically skipped. - column_names: Overrides headers found in the document.
If
column_names
is used,header_row
will be ignored. - n_rows: Specifies how many rows should be loaded.
If
None
, all rows are loaded - skip_rows: Specifies how many rows should be skipped after the
header_row
. Any rows before theheader_row
are automatically skipped. Ifheader_row
isNone
:- if
skip_rows
isNone
(default): it skips all empty rows at the beginning of the sheet. - if
skip_rows
is a number, it skips the specified number of rows from the start of the sheet.
- if
- schema_sample_rows: Specifies how many rows should be used to determine
the dtype of a column. Cannot be 0. A specific dtype can be
enforced for some or all columns through the
dtypes
parameter. IfNone
, all rows will be used. - dtype_coercion: Specifies how type coercion should behave.
coerce
(the default) will try to coerce different dtypes in a column to the same one, whereasstrict
will raise an error in case a column contains several dtypes. Note that this only applies to columns whose dtype is guessed, i.e. not specified viadtypes
. - use_columns: Specifies the columns to use. Can either be:
None
to select all columns- A list of strings and ints, the column names and/or indices (starting at 0)
- A string, a comma separated list of Excel column letters and column
ranges (e.g.
“A:E”
or“A,C,E:F”
, which would result inA,B,C,D,E
andA,C,E,F
) - A callable, a function that takes a column and returns a boolean indicating whether the column should be used
- dtypes: An optional dtype (for all columns) or dict of dtypes with keys as column indices or names.
272 def table_names(self, sheet_name: str | None = None) -> list[str]: 273 """The list of table names. 274 275 Will return an empty list if no tables are found. 276 277 :param sheet_name: If given, will limit the list to the given sheet, will be faster 278 too. 279 """ 280 return self._reader.table_names(sheet_name)
The list of table names.
Will return an empty list if no tables are found.
Parameters
- sheet_name: If given, will limit the list to the given sheet, will be faster too.
312 def load_table( 313 self, 314 name: str, 315 *, 316 header_row: int | None = None, 317 column_names: list[str] | None = None, 318 skip_rows: int = 0, 319 n_rows: int | None = None, 320 schema_sample_rows: int | None = 1_000, 321 dtype_coercion: Literal["coerce", "strict"] = "coerce", 322 use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None, 323 dtypes: DType | DTypeMap | None = None, 324 eager: bool = False, 325 ) -> ExcelTable | pa.RecordBatch: 326 """Loads a table by name. 327 328 :param name: The name of the table to load. 329 :param header_row: The index of the row containing the column labels. 330 If `None`, the table's column names will be used. 331 Any rows before the `header_row` will be automatically skipped. 332 :param column_names: Overrides headers found in the document. 333 If `column_names` is used, `header_row` will be ignored. 334 :param n_rows: Specifies how many rows should be loaded. 335 If `None`, all rows are loaded 336 :param skip_rows: Specifies how many rows should be skipped after the `header_row`. 337 Any rows before the `header_row` are automatically skipped. 338 If `header_row` is `None`, it skips the number of rows from the 339 start of the sheet. 340 :param schema_sample_rows: Specifies how many rows should be used to determine 341 the dtype of a column. Cannot be 0. A specific dtype can be 342 enforced for some or all columns through the `dtypes` parameter. 343 If `None`, all rows will be used. 344 :param dtype_coercion: Specifies how type coercion should behave. `coerce` (the default) 345 will try to coerce different dtypes in a column to the same one, 346 whereas `strict` will raise an error in case a column contains 347 several dtypes. Note that this only applies to columns whose dtype 348 is guessed, i.e. not specified via `dtypes`. 349 :param use_columns: Specifies the columns to use. Can either be: 350 - `None` to select all columns 351 - A list of strings and ints, the column names and/or indices 352 (starting at 0) 353 - A string, a comma separated list of Excel column letters and column 354 ranges (e.g. `“A:E”` or `“A,C,E:F”`, which would result in 355 `A,B,C,D,E` and `A,C,E,F`) 356 - A callable, a function that takes a column and returns a boolean 357 indicating whether the column should be used 358 :param dtypes: An optional dtype (for all columns) 359 or dict of dtypes with keys as column indices or names. 360 """ 361 output = self._reader.load_table( # type:ignore[call-overload,misc] 362 name=name, 363 header_row=header_row, 364 column_names=column_names, 365 skip_rows=skip_rows, 366 n_rows=n_rows, 367 schema_sample_rows=schema_sample_rows, 368 dtype_coercion=dtype_coercion, 369 use_columns=use_columns, 370 dtypes=dtypes, 371 eager=eager, 372 ) 373 if eager: 374 return output 375 return ExcelTable(output)
Loads a table by name.
Parameters
- name: The name of the table to load.
- header_row: The index of the row containing the column labels.
If
None
, the table's column names will be used. Any rows before theheader_row
will be automatically skipped. - column_names: Overrides headers found in the document.
If
column_names
is used,header_row
will be ignored. - n_rows: Specifies how many rows should be loaded.
If
None
, all rows are loaded - skip_rows: Specifies how many rows should be skipped after the
header_row
. Any rows before theheader_row
are automatically skipped. Ifheader_row
isNone
, it skips the number of rows from the start of the sheet. - schema_sample_rows: Specifies how many rows should be used to determine
the dtype of a column. Cannot be 0. A specific dtype can be
enforced for some or all columns through the
dtypes
parameter. IfNone
, all rows will be used. - dtype_coercion: Specifies how type coercion should behave.
coerce
(the default) will try to coerce different dtypes in a column to the same one, whereasstrict
will raise an error in case a column contains several dtypes. Note that this only applies to columns whose dtype is guessed, i.e. not specified viadtypes
. - use_columns: Specifies the columns to use. Can either be:
None
to select all columns- A list of strings and ints, the column names and/or indices (starting at 0)
- A string, a comma separated list of Excel column letters and column
ranges (e.g.
“A:E”
or“A,C,E:F”
, which would result inA,B,C,D,E
andA,C,E,F
) - A callable, a function that takes a column and returns a boolean indicating whether the column should be used
- dtypes: An optional dtype (for all columns) or dict of dtypes with keys as column indices or names.
377 def load_sheet_eager( 378 self, 379 idx_or_name: int | str, 380 *, 381 header_row: int | None = 0, 382 column_names: list[str] | None = None, 383 skip_rows: int | None = None, 384 n_rows: int | None = None, 385 schema_sample_rows: int | None = 1_000, 386 dtype_coercion: Literal["coerce", "strict"] = "coerce", 387 use_columns: list[str] | list[int] | str | None = None, 388 dtypes: DType | DTypeMap | None = None, 389 ) -> pa.RecordBatch: 390 """Loads a sheet eagerly by index or name. 391 392 For xlsx files, this will be faster and more memory-efficient, as it will use 393 `worksheet_range_ref` under the hood, which returns borrowed types. 394 395 Refer to `load_sheet` for parameter documentation 396 """ 397 return self._reader.load_sheet( 398 idx_or_name=idx_or_name, 399 header_row=header_row, 400 column_names=column_names, 401 skip_rows=skip_rows, 402 n_rows=n_rows, 403 schema_sample_rows=schema_sample_rows, 404 dtype_coercion=dtype_coercion, 405 use_columns=use_columns, 406 dtypes=dtypes, 407 eager=True, 408 )
Loads a sheet eagerly by index or name.
For xlsx files, this will be faster and more memory-efficient, as it will use
worksheet_range_ref
under the hood, which returns borrowed types.
Refer to load_sheet
for parameter documentation
410 def load_sheet_by_name( 411 self, 412 name: str, 413 *, 414 header_row: int | None = 0, 415 column_names: list[str] | None = None, 416 skip_rows: int | None = None, 417 n_rows: int | None = None, 418 schema_sample_rows: int | None = 1_000, 419 dtype_coercion: Literal["coerce", "strict"] = "coerce", 420 use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None, 421 dtypes: DType | DTypeMap | None = None, 422 ) -> ExcelSheet: 423 """Loads a sheet by name. 424 425 Refer to `load_sheet` for parameter documentation 426 """ 427 return self.load_sheet( 428 name, 429 header_row=header_row, 430 column_names=column_names, 431 skip_rows=skip_rows, 432 n_rows=n_rows, 433 schema_sample_rows=schema_sample_rows, 434 dtype_coercion=dtype_coercion, 435 use_columns=use_columns, 436 dtypes=dtypes, 437 )
Loads a sheet by name.
Refer to load_sheet
for parameter documentation
439 def load_sheet_by_idx( 440 self, 441 idx: int, 442 *, 443 header_row: int | None = 0, 444 column_names: list[str] | None = None, 445 skip_rows: int | None = None, 446 n_rows: int | None = None, 447 schema_sample_rows: int | None = 1_000, 448 dtype_coercion: Literal["coerce", "strict"] = "coerce", 449 use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None, 450 dtypes: DType | DTypeMap | None = None, 451 ) -> ExcelSheet: 452 """Loads a sheet by index. 453 454 Refer to `load_sheet` for parameter documentation 455 """ 456 return self.load_sheet( 457 idx, 458 header_row=header_row, 459 column_names=column_names, 460 skip_rows=skip_rows, 461 n_rows=n_rows, 462 schema_sample_rows=schema_sample_rows, 463 dtype_coercion=dtype_coercion, 464 use_columns=use_columns, 465 dtypes=dtypes, 466 )
Loads a sheet by index.
Refer to load_sheet
for parameter documentation
55class ExcelSheet: 56 """A class representing a single sheet in an Excel File""" 57 58 def __init__(self, sheet: _ExcelSheet) -> None: 59 self._sheet = sheet 60 61 @property 62 def name(self) -> str: 63 """The name of the sheet""" 64 return self._sheet.name 65 66 @property 67 def width(self) -> int: 68 """The sheet's width""" 69 return self._sheet.width 70 71 @property 72 def height(self) -> int: 73 """The sheet's height, with `skip_rows` and `nrows` applied""" 74 return self._sheet.height 75 76 @property 77 def total_height(self) -> int: 78 """The sheet's total height""" 79 return self._sheet.total_height 80 81 @property 82 def selected_columns(self) -> list[ColumnInfo]: 83 """The sheet's selected columns""" 84 return self._sheet.selected_columns 85 86 @property 87 def available_columns(self) -> list[ColumnInfo]: 88 """The columns available for the given sheet""" 89 return self._sheet.available_columns 90 91 @property 92 def specified_dtypes(self) -> DTypeMap | None: 93 """The dtypes specified for the sheet""" 94 return self._sheet.specified_dtypes 95 96 @property 97 def visible(self) -> SheetVisible: 98 """The visibility of the sheet""" 99 return self._sheet.visible 100 101 def to_arrow(self) -> pa.RecordBatch: 102 """Converts the sheet to a pyarrow `RecordBatch`""" 103 return self._sheet.to_arrow() 104 105 def to_pandas(self) -> "pd.DataFrame": 106 """Converts the sheet to a Pandas `DataFrame`. 107 108 Requires the `pandas` extra to be installed. 109 """ 110 # We know for sure that the sheet will yield exactly one RecordBatch 111 return self.to_arrow().to_pandas() 112 113 def to_polars(self) -> "pl.DataFrame": 114 """Converts the sheet to a Polars `DataFrame`. 115 116 Requires the `polars` extra to be installed. 117 """ 118 return _recordbatch_to_polars(self.to_arrow()) 119 120 def __repr__(self) -> str: 121 return self._sheet.__repr__()
A class representing a single sheet in an Excel File
71 @property 72 def height(self) -> int: 73 """The sheet's height, with `skip_rows` and `nrows` applied""" 74 return self._sheet.height
The sheet's height, with skip_rows
and nrows
applied
76 @property 77 def total_height(self) -> int: 78 """The sheet's total height""" 79 return self._sheet.total_height
The sheet's total height
81 @property 82 def selected_columns(self) -> list[ColumnInfo]: 83 """The sheet's selected columns""" 84 return self._sheet.selected_columns
The sheet's selected columns
86 @property 87 def available_columns(self) -> list[ColumnInfo]: 88 """The columns available for the given sheet""" 89 return self._sheet.available_columns
The columns available for the given sheet
91 @property 92 def specified_dtypes(self) -> DTypeMap | None: 93 """The dtypes specified for the sheet""" 94 return self._sheet.specified_dtypes
The dtypes specified for the sheet
96 @property 97 def visible(self) -> SheetVisible: 98 """The visibility of the sheet""" 99 return self._sheet.visible
The visibility of the sheet
101 def to_arrow(self) -> pa.RecordBatch: 102 """Converts the sheet to a pyarrow `RecordBatch`""" 103 return self._sheet.to_arrow()
Converts the sheet to a pyarrow RecordBatch
105 def to_pandas(self) -> "pd.DataFrame": 106 """Converts the sheet to a Pandas `DataFrame`. 107 108 Requires the `pandas` extra to be installed. 109 """ 110 # We know for sure that the sheet will yield exactly one RecordBatch 111 return self.to_arrow().to_pandas()
Converts the sheet to a Pandas DataFrame
.
Requires the pandas
extra to be installed.
113 def to_polars(self) -> "pl.DataFrame": 114 """Converts the sheet to a Polars `DataFrame`. 115 116 Requires the `polars` extra to be installed. 117 """ 118 return _recordbatch_to_polars(self.to_arrow())
Converts the sheet to a Polars DataFrame
.
Requires the polars
extra to be installed.
This class provides information about a single column in a sheet
fastexcel.ColumnNameFrom
. How the name of the column was determined.
One of three possible values:
"provided"
: The column name was provided via theuse_columns
parameter"looked_up"
: The column name was looked up from the data found in the sheet"generated"
: The column name was generated from the column index, either becauseheader_row
wasNone
, or because it could not be looked up
fastexcel.DTypeFrom
. How the dtype of the column was determined.
One of three possible values:
"provided_by_index"
: The dtype was specified via the column index"provided_by_name"
: The dtype was specified via the column name"guessed"
: The dtype was determined from the content of the column
The base class for all fastexcel errors
Data for a given cell cannot be retrieved
calamine returned an error regarding the content of the cell
Generic calamine error
Sheet was not found
Column was not found
Generic arrow error
Provided parameters are invalid
Column contains an unsupported type combination