fastexcel
1from __future__ import annotations 2 3import sys 4import typing 5from typing import TYPE_CHECKING, Callable, Literal 6 7if sys.version_info < (3, 10): 8 from typing_extensions import TypeAlias 9else: 10 from typing import TypeAlias 11 12if TYPE_CHECKING: 13 import pandas as pd 14 import polars as pl 15 16from os.path import expanduser 17from pathlib import Path 18 19import pyarrow as pa 20 21from ._fastexcel import ( 22 ArrowError, 23 CalamineCellError, 24 CalamineError, 25 CannotRetrieveCellDataError, 26 ColumnInfo, 27 ColumnNotFoundError, 28 FastExcelError, 29 InvalidParametersError, 30 SheetNotFoundError, 31 UnsupportedColumnTypeCombinationError, 32 __version__, 33 _ExcelReader, 34 _ExcelSheet, 35 _ExcelTable, 36) 37from ._fastexcel import read_excel as _read_excel 38 39DType = Literal["null", "int", "float", "string", "boolean", "datetime", "date", "duration"] 40DTypeMap: TypeAlias = "dict[str | int, DType]" 41ColumnNameFrom: TypeAlias = Literal["provided", "looked_up", "generated"] 42DTypeFrom: TypeAlias = Literal["provided_by_index", "provided_by_name", "guessed"] 43SheetVisible: TypeAlias = Literal["visible", "hidden", "veryhidden"] 44 45 46def _recordbatch_to_polars(rb: pa.RecordBatch) -> pl.DataFrame: 47 import polars as pl 48 49 df = pl.from_arrow(data=rb) 50 assert isinstance(df, pl.DataFrame) 51 return df 52 53 54class ExcelSheet: 55 """A class representing a single sheet in an Excel File""" 56 57 def __init__(self, sheet: _ExcelSheet) -> None: 58 self._sheet = sheet 59 60 @property 61 def name(self) -> str: 62 """The name of the sheet""" 63 return self._sheet.name 64 65 @property 66 def width(self) -> int: 67 """The sheet's width""" 68 return self._sheet.width 69 70 @property 71 def height(self) -> int: 72 """The sheet's height, with `skip_rows` and `nrows` applied""" 73 return self._sheet.height 74 75 @property 76 def total_height(self) -> int: 77 """The sheet's total height""" 78 return self._sheet.total_height 79 80 @property 81 def selected_columns(self) -> list[ColumnInfo]: 82 """The sheet's selected columns""" 83 return self._sheet.selected_columns 84 85 @property 86 def available_columns(self) -> list[ColumnInfo]: 87 """The columns available for the given sheet""" 88 return self._sheet.available_columns 89 90 @property 91 def specified_dtypes(self) -> DTypeMap | None: 92 """The dtypes specified for the sheet""" 93 return self._sheet.specified_dtypes 94 95 @property 96 def visible(self) -> SheetVisible: 97 """The visibility of the sheet""" 98 return self._sheet.visible 99 100 def to_arrow(self) -> pa.RecordBatch: 101 """Converts the sheet to a pyarrow `RecordBatch`""" 102 return self._sheet.to_arrow() 103 104 def to_pandas(self) -> "pd.DataFrame": 105 """Converts the sheet to a Pandas `DataFrame`. 106 107 Requires the `pandas` extra to be installed. 108 """ 109 # We know for sure that the sheet will yield exactly one RecordBatch 110 return self.to_arrow().to_pandas() 111 112 def to_polars(self) -> "pl.DataFrame": 113 """Converts the sheet to a Polars `DataFrame`. 114 115 Requires the `polars` extra to be installed. 116 """ 117 return _recordbatch_to_polars(self.to_arrow()) 118 119 def __repr__(self) -> str: 120 return self._sheet.__repr__() 121 122 123class ExcelTable: 124 """A class representing a single table in an Excel file""" 125 126 def __init__(self, table: _ExcelTable) -> None: 127 self._table = table 128 129 @property 130 def name(self) -> str: 131 """The name of the table""" 132 return self._table.name 133 134 @property 135 def sheet_name(self) -> str: 136 """The name of the sheet this table belongs to""" 137 return self._table.sheet_name 138 139 @property 140 def width(self) -> int: 141 """The table's width""" 142 return self._table.width 143 144 @property 145 def height(self) -> int: 146 """The table's height""" 147 return self._table.height 148 149 @property 150 def total_height(self) -> int: 151 """The table's total height""" 152 return self._table.total_height 153 154 @property 155 def offset(self) -> int: 156 """The table's offset before data starts""" 157 return self._table.offset 158 159 @property 160 def selected_columns(self) -> list[ColumnInfo]: 161 """The table's selected columns""" 162 return self._table.selected_columns 163 164 @property 165 def available_columns(self) -> list[ColumnInfo]: 166 """The columns available for the given table""" 167 return self._table.available_columns 168 169 @property 170 def specified_dtypes(self) -> DTypeMap | None: 171 """The dtypes specified for the table""" 172 return self._table.specified_dtypes 173 174 def to_arrow(self) -> pa.RecordBatch: 175 """Converts the table to a pyarrow `RecordBatch`""" 176 return self._table.to_arrow() 177 178 def to_pandas(self) -> "pd.DataFrame": 179 """Converts the table to a Pandas `DataFrame`. 180 181 Requires the `pandas` extra to be installed. 182 """ 183 # We know for sure that the table will yield exactly one RecordBatch 184 return self.to_arrow().to_pandas() 185 186 def to_polars(self) -> "pl.DataFrame": 187 """Converts the table to a Polars `DataFrame`. 188 189 Requires the `polars` extra to be installed. 190 """ 191 return _recordbatch_to_polars(self.to_arrow()) 192 193 194class ExcelReader: 195 """A class representing an open Excel file and allowing to read its sheets""" 196 197 def __init__(self, reader: _ExcelReader) -> None: 198 self._reader = reader 199 200 @property 201 def sheet_names(self) -> list[str]: 202 """The list of sheet names""" 203 return self._reader.sheet_names 204 205 def load_sheet( 206 self, 207 idx_or_name: int | str, 208 *, 209 header_row: int | None = 0, 210 column_names: list[str] | None = None, 211 skip_rows: int = 0, 212 n_rows: int | None = None, 213 schema_sample_rows: int | None = 1_000, 214 dtype_coercion: Literal["coerce", "strict"] = "coerce", 215 use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None, 216 dtypes: DTypeMap | None = None, 217 ) -> ExcelSheet: 218 """Loads a sheet lazily by index or name. 219 220 :param idx_or_name: The index (starting at 0) or the name of the sheet to load. 221 :param header_row: The index of the row containing the column labels, default index is 0. 222 If `None`, the sheet does not have any column labels. 223 Any rows before the `header_row` will be automatically skipped. 224 :param column_names: Overrides headers found in the document. 225 If `column_names` is used, `header_row` will be ignored. 226 :param n_rows: Specifies how many rows should be loaded. 227 If `None`, all rows are loaded 228 :param skip_rows: Specifies how many rows should be skipped after the `header_row`. 229 Any rows before the `header_row` are automatically skipped. 230 If `header_row` is `None`, it skips the number of rows from the 231 start of the sheet. 232 :param schema_sample_rows: Specifies how many rows should be used to determine 233 the dtype of a column. 234 If `None`, all rows will be used. 235 :param dtype_coercion: Specifies how type coercion should behave. `coerce` (the default) 236 will try to coerce different dtypes in a column to the same one, 237 whereas `strict` will raise an error in case a column contains 238 several dtypes. Note that this only applies to columns whose dtype 239 is guessed, i.e. not specified via `dtypes`. 240 :param use_columns: Specifies the columns to use. Can either be: 241 - `None` to select all columns 242 - A list of strings and ints, the column names and/or indices 243 (starting at 0) 244 - A string, a comma separated list of Excel column letters and column 245 ranges (e.g. `“A:E”` or `“A,C,E:F”`, which would result in 246 `A,B,C,D,E` and `A,C,E,F`) 247 - A callable, a function that takes a column and returns a boolean 248 indicating whether the column should be used 249 :param dtypes: An optional dict of dtypes. Keys can be column indices or names 250 """ 251 return ExcelSheet( 252 self._reader.load_sheet( 253 idx_or_name=idx_or_name, 254 header_row=header_row, 255 column_names=column_names, 256 skip_rows=skip_rows, 257 n_rows=n_rows, 258 schema_sample_rows=schema_sample_rows, 259 dtype_coercion=dtype_coercion, 260 use_columns=use_columns, 261 dtypes=dtypes, 262 eager=False, 263 ) 264 ) 265 266 def table_names(self, sheet_idx_or_name: str | int | None = None) -> list[str]: 267 """The list of table names. 268 269 Will return an empty list if no tables are found. 270 271 :param sheet_idx_or_name: If given, will limit the list to the given sheet, will be faster 272 too. 273 """ 274 return self._reader.table_names(sheet_idx_or_name) 275 276 @typing.overload 277 def load_table( 278 self, 279 name: str, 280 *, 281 header_row: int | None = None, 282 column_names: list[str] | None = None, 283 skip_rows: int = 0, 284 n_rows: int | None = None, 285 schema_sample_rows: int | None = 1_000, 286 dtype_coercion: Literal["coerce", "strict"] = "coerce", 287 use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None, 288 dtypes: DTypeMap | None = None, 289 eager: Literal[False] = ..., 290 ) -> ExcelTable: ... 291 @typing.overload 292 def load_table( 293 self, 294 name: str, 295 *, 296 header_row: int | None = None, 297 column_names: list[str] | None = None, 298 skip_rows: int = 0, 299 n_rows: int | None = None, 300 schema_sample_rows: int | None = 1_000, 301 dtype_coercion: Literal["coerce", "strict"] = "coerce", 302 use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None, 303 dtypes: DTypeMap | None = None, 304 eager: Literal[True] = ..., 305 ) -> pa.RecordBatch: ... 306 def load_table( 307 self, 308 name: str, 309 *, 310 header_row: int | None = None, 311 column_names: list[str] | None = None, 312 skip_rows: int = 0, 313 n_rows: int | None = None, 314 schema_sample_rows: int | None = 1_000, 315 dtype_coercion: Literal["coerce", "strict"] = "coerce", 316 use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None, 317 dtypes: DTypeMap | None = None, 318 eager: bool = False, 319 ) -> ExcelTable | pa.RecordBatch: 320 """Loads a table by name. 321 322 :param name: The name of the table to load. 323 :param header_row: The index of the row containing the column labels. 324 If `None`, the table's column names will be used. 325 Any rows before the `header_row` will be automatically skipped. 326 :param column_names: Overrides headers found in the document. 327 If `column_names` is used, `header_row` will be ignored. 328 :param n_rows: Specifies how many rows should be loaded. 329 If `None`, all rows are loaded 330 :param skip_rows: Specifies how many rows should be skipped after the `header_row`. 331 Any rows before the `header_row` are automatically skipped. 332 If `header_row` is `None`, it skips the number of rows from the 333 start of the sheet. 334 :param schema_sample_rows: Specifies how many rows should be used to determine 335 the dtype of a column. 336 If `None`, all rows will be used. 337 :param dtype_coercion: Specifies how type coercion should behave. `coerce` (the default) 338 will try to coerce different dtypes in a column to the same one, 339 whereas `strict` will raise an error in case a column contains 340 several dtypes. Note that this only applies to columns whose dtype 341 is guessed, i.e. not specified via `dtypes`. 342 :param use_columns: Specifies the columns to use. Can either be: 343 - `None` to select all columns 344 - A list of strings and ints, the column names and/or indices 345 (starting at 0) 346 - A string, a comma separated list of Excel column letters and column 347 ranges (e.g. `“A:E”` or `“A,C,E:F”`, which would result in 348 `A,B,C,D,E` and `A,C,E,F`) 349 - A callable, a function that takes a column and returns a boolean 350 indicating whether the column should be used 351 :param dtypes: An optional dict of dtypes. Keys can be column indices or names 352 """ 353 output = self._reader.load_table( # type:ignore[call-overload,misc] 354 name=name, 355 header_row=header_row, 356 column_names=column_names, 357 skip_rows=skip_rows, 358 n_rows=n_rows, 359 schema_sample_rows=schema_sample_rows, 360 dtype_coercion=dtype_coercion, 361 use_columns=use_columns, 362 dtypes=dtypes, 363 eager=eager, 364 ) 365 if eager: 366 return output 367 return ExcelTable(output) 368 369 def load_sheet_eager( 370 self, 371 idx_or_name: int | str, 372 *, 373 header_row: int | None = 0, 374 column_names: list[str] | None = None, 375 skip_rows: int = 0, 376 n_rows: int | None = None, 377 schema_sample_rows: int | None = 1_000, 378 dtype_coercion: Literal["coerce", "strict"] = "coerce", 379 use_columns: list[str] | list[int] | str | None = None, 380 dtypes: DTypeMap | None = None, 381 ) -> pa.RecordBatch: 382 """Loads a sheet eagerly by index or name. 383 384 For xlsx files, this will be faster and more memory-efficient, as it will use 385 `worksheet_range_ref` under the hood, which returns borrowed types. 386 387 Refer to `load_sheet` for parameter documentation 388 """ 389 return self._reader.load_sheet( 390 idx_or_name=idx_or_name, 391 header_row=header_row, 392 column_names=column_names, 393 skip_rows=skip_rows, 394 n_rows=n_rows, 395 schema_sample_rows=schema_sample_rows, 396 dtype_coercion=dtype_coercion, 397 use_columns=use_columns, 398 dtypes=dtypes, 399 eager=True, 400 ) 401 402 def load_sheet_by_name( 403 self, 404 name: str, 405 *, 406 header_row: int | None = 0, 407 column_names: list[str] | None = None, 408 skip_rows: int = 0, 409 n_rows: int | None = None, 410 schema_sample_rows: int | None = 1_000, 411 dtype_coercion: Literal["coerce", "strict"] = "coerce", 412 use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None, 413 dtypes: DTypeMap | None = None, 414 ) -> ExcelSheet: 415 """Loads a sheet by name. 416 417 Refer to `load_sheet` for parameter documentation 418 """ 419 return self.load_sheet( 420 name, 421 header_row=header_row, 422 column_names=column_names, 423 skip_rows=skip_rows, 424 n_rows=n_rows, 425 schema_sample_rows=schema_sample_rows, 426 dtype_coercion=dtype_coercion, 427 use_columns=use_columns, 428 dtypes=dtypes, 429 ) 430 431 def load_sheet_by_idx( 432 self, 433 idx: int, 434 *, 435 header_row: int | None = 0, 436 column_names: list[str] | None = None, 437 skip_rows: int = 0, 438 n_rows: int | None = None, 439 schema_sample_rows: int | None = 1_000, 440 dtype_coercion: Literal["coerce", "strict"] = "coerce", 441 use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None, 442 dtypes: DTypeMap | None = None, 443 ) -> ExcelSheet: 444 """Loads a sheet by index. 445 446 Refer to `load_sheet` for parameter documentation 447 """ 448 return self.load_sheet( 449 idx, 450 header_row=header_row, 451 column_names=column_names, 452 skip_rows=skip_rows, 453 n_rows=n_rows, 454 schema_sample_rows=schema_sample_rows, 455 dtype_coercion=dtype_coercion, 456 use_columns=use_columns, 457 dtypes=dtypes, 458 ) 459 460 def __repr__(self) -> str: 461 return self._reader.__repr__() 462 463 464def read_excel(source: Path | str | bytes) -> ExcelReader: 465 """Opens and loads an excel file. 466 467 :param source: The path to a file or its content as bytes 468 """ 469 if isinstance(source, (str, Path)): 470 source = expanduser(source) 471 return ExcelReader(_read_excel(source)) 472 473 474__all__ = ( 475 ## version 476 "__version__", 477 ## main entrypoint 478 "read_excel", 479 ## Python types 480 "DType", 481 "DTypeMap", 482 # Excel reader 483 "ExcelReader", 484 # Excel sheet 485 "ExcelSheet", 486 # Column metadata 487 "DTypeFrom", 488 "ColumnNameFrom", 489 "ColumnInfo", 490 # Exceptions 491 "FastExcelError", 492 "CannotRetrieveCellDataError", 493 "CalamineCellError", 494 "CalamineError", 495 "SheetNotFoundError", 496 "ColumnNotFoundError", 497 "ArrowError", 498 "InvalidParametersError", 499 "UnsupportedColumnTypeCombinationError", 500)
465def read_excel(source: Path | str | bytes) -> ExcelReader: 466 """Opens and loads an excel file. 467 468 :param source: The path to a file or its content as bytes 469 """ 470 if isinstance(source, (str, Path)): 471 source = expanduser(source) 472 return ExcelReader(_read_excel(source))
Opens and loads an excel file.
Parameters
- source: The path to a file or its content as bytes
195class ExcelReader: 196 """A class representing an open Excel file and allowing to read its sheets""" 197 198 def __init__(self, reader: _ExcelReader) -> None: 199 self._reader = reader 200 201 @property 202 def sheet_names(self) -> list[str]: 203 """The list of sheet names""" 204 return self._reader.sheet_names 205 206 def load_sheet( 207 self, 208 idx_or_name: int | str, 209 *, 210 header_row: int | None = 0, 211 column_names: list[str] | None = None, 212 skip_rows: int = 0, 213 n_rows: int | None = None, 214 schema_sample_rows: int | None = 1_000, 215 dtype_coercion: Literal["coerce", "strict"] = "coerce", 216 use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None, 217 dtypes: DTypeMap | None = None, 218 ) -> ExcelSheet: 219 """Loads a sheet lazily by index or name. 220 221 :param idx_or_name: The index (starting at 0) or the name of the sheet to load. 222 :param header_row: The index of the row containing the column labels, default index is 0. 223 If `None`, the sheet does not have any column labels. 224 Any rows before the `header_row` will be automatically skipped. 225 :param column_names: Overrides headers found in the document. 226 If `column_names` is used, `header_row` will be ignored. 227 :param n_rows: Specifies how many rows should be loaded. 228 If `None`, all rows are loaded 229 :param skip_rows: Specifies how many rows should be skipped after the `header_row`. 230 Any rows before the `header_row` are automatically skipped. 231 If `header_row` is `None`, it skips the number of rows from the 232 start of the sheet. 233 :param schema_sample_rows: Specifies how many rows should be used to determine 234 the dtype of a column. 235 If `None`, all rows will be used. 236 :param dtype_coercion: Specifies how type coercion should behave. `coerce` (the default) 237 will try to coerce different dtypes in a column to the same one, 238 whereas `strict` will raise an error in case a column contains 239 several dtypes. Note that this only applies to columns whose dtype 240 is guessed, i.e. not specified via `dtypes`. 241 :param use_columns: Specifies the columns to use. Can either be: 242 - `None` to select all columns 243 - A list of strings and ints, the column names and/or indices 244 (starting at 0) 245 - A string, a comma separated list of Excel column letters and column 246 ranges (e.g. `“A:E”` or `“A,C,E:F”`, which would result in 247 `A,B,C,D,E` and `A,C,E,F`) 248 - A callable, a function that takes a column and returns a boolean 249 indicating whether the column should be used 250 :param dtypes: An optional dict of dtypes. Keys can be column indices or names 251 """ 252 return ExcelSheet( 253 self._reader.load_sheet( 254 idx_or_name=idx_or_name, 255 header_row=header_row, 256 column_names=column_names, 257 skip_rows=skip_rows, 258 n_rows=n_rows, 259 schema_sample_rows=schema_sample_rows, 260 dtype_coercion=dtype_coercion, 261 use_columns=use_columns, 262 dtypes=dtypes, 263 eager=False, 264 ) 265 ) 266 267 def table_names(self, sheet_idx_or_name: str | int | None = None) -> list[str]: 268 """The list of table names. 269 270 Will return an empty list if no tables are found. 271 272 :param sheet_idx_or_name: If given, will limit the list to the given sheet, will be faster 273 too. 274 """ 275 return self._reader.table_names(sheet_idx_or_name) 276 277 @typing.overload 278 def load_table( 279 self, 280 name: str, 281 *, 282 header_row: int | None = None, 283 column_names: list[str] | None = None, 284 skip_rows: int = 0, 285 n_rows: int | None = None, 286 schema_sample_rows: int | None = 1_000, 287 dtype_coercion: Literal["coerce", "strict"] = "coerce", 288 use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None, 289 dtypes: DTypeMap | None = None, 290 eager: Literal[False] = ..., 291 ) -> ExcelTable: ... 292 @typing.overload 293 def load_table( 294 self, 295 name: str, 296 *, 297 header_row: int | None = None, 298 column_names: list[str] | None = None, 299 skip_rows: int = 0, 300 n_rows: int | None = None, 301 schema_sample_rows: int | None = 1_000, 302 dtype_coercion: Literal["coerce", "strict"] = "coerce", 303 use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None, 304 dtypes: DTypeMap | None = None, 305 eager: Literal[True] = ..., 306 ) -> pa.RecordBatch: ... 307 def load_table( 308 self, 309 name: str, 310 *, 311 header_row: int | None = None, 312 column_names: list[str] | None = None, 313 skip_rows: int = 0, 314 n_rows: int | None = None, 315 schema_sample_rows: int | None = 1_000, 316 dtype_coercion: Literal["coerce", "strict"] = "coerce", 317 use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None, 318 dtypes: DTypeMap | None = None, 319 eager: bool = False, 320 ) -> ExcelTable | pa.RecordBatch: 321 """Loads a table by name. 322 323 :param name: The name of the table to load. 324 :param header_row: The index of the row containing the column labels. 325 If `None`, the table's column names will be used. 326 Any rows before the `header_row` will be automatically skipped. 327 :param column_names: Overrides headers found in the document. 328 If `column_names` is used, `header_row` will be ignored. 329 :param n_rows: Specifies how many rows should be loaded. 330 If `None`, all rows are loaded 331 :param skip_rows: Specifies how many rows should be skipped after the `header_row`. 332 Any rows before the `header_row` are automatically skipped. 333 If `header_row` is `None`, it skips the number of rows from the 334 start of the sheet. 335 :param schema_sample_rows: Specifies how many rows should be used to determine 336 the dtype of a column. 337 If `None`, all rows will be used. 338 :param dtype_coercion: Specifies how type coercion should behave. `coerce` (the default) 339 will try to coerce different dtypes in a column to the same one, 340 whereas `strict` will raise an error in case a column contains 341 several dtypes. Note that this only applies to columns whose dtype 342 is guessed, i.e. not specified via `dtypes`. 343 :param use_columns: Specifies the columns to use. Can either be: 344 - `None` to select all columns 345 - A list of strings and ints, the column names and/or indices 346 (starting at 0) 347 - A string, a comma separated list of Excel column letters and column 348 ranges (e.g. `“A:E”` or `“A,C,E:F”`, which would result in 349 `A,B,C,D,E` and `A,C,E,F`) 350 - A callable, a function that takes a column and returns a boolean 351 indicating whether the column should be used 352 :param dtypes: An optional dict of dtypes. Keys can be column indices or names 353 """ 354 output = self._reader.load_table( # type:ignore[call-overload,misc] 355 name=name, 356 header_row=header_row, 357 column_names=column_names, 358 skip_rows=skip_rows, 359 n_rows=n_rows, 360 schema_sample_rows=schema_sample_rows, 361 dtype_coercion=dtype_coercion, 362 use_columns=use_columns, 363 dtypes=dtypes, 364 eager=eager, 365 ) 366 if eager: 367 return output 368 return ExcelTable(output) 369 370 def load_sheet_eager( 371 self, 372 idx_or_name: int | str, 373 *, 374 header_row: int | None = 0, 375 column_names: list[str] | None = None, 376 skip_rows: int = 0, 377 n_rows: int | None = None, 378 schema_sample_rows: int | None = 1_000, 379 dtype_coercion: Literal["coerce", "strict"] = "coerce", 380 use_columns: list[str] | list[int] | str | None = None, 381 dtypes: DTypeMap | None = None, 382 ) -> pa.RecordBatch: 383 """Loads a sheet eagerly by index or name. 384 385 For xlsx files, this will be faster and more memory-efficient, as it will use 386 `worksheet_range_ref` under the hood, which returns borrowed types. 387 388 Refer to `load_sheet` for parameter documentation 389 """ 390 return self._reader.load_sheet( 391 idx_or_name=idx_or_name, 392 header_row=header_row, 393 column_names=column_names, 394 skip_rows=skip_rows, 395 n_rows=n_rows, 396 schema_sample_rows=schema_sample_rows, 397 dtype_coercion=dtype_coercion, 398 use_columns=use_columns, 399 dtypes=dtypes, 400 eager=True, 401 ) 402 403 def load_sheet_by_name( 404 self, 405 name: str, 406 *, 407 header_row: int | None = 0, 408 column_names: list[str] | None = None, 409 skip_rows: int = 0, 410 n_rows: int | None = None, 411 schema_sample_rows: int | None = 1_000, 412 dtype_coercion: Literal["coerce", "strict"] = "coerce", 413 use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None, 414 dtypes: DTypeMap | None = None, 415 ) -> ExcelSheet: 416 """Loads a sheet by name. 417 418 Refer to `load_sheet` for parameter documentation 419 """ 420 return self.load_sheet( 421 name, 422 header_row=header_row, 423 column_names=column_names, 424 skip_rows=skip_rows, 425 n_rows=n_rows, 426 schema_sample_rows=schema_sample_rows, 427 dtype_coercion=dtype_coercion, 428 use_columns=use_columns, 429 dtypes=dtypes, 430 ) 431 432 def load_sheet_by_idx( 433 self, 434 idx: int, 435 *, 436 header_row: int | None = 0, 437 column_names: list[str] | None = None, 438 skip_rows: int = 0, 439 n_rows: int | None = None, 440 schema_sample_rows: int | None = 1_000, 441 dtype_coercion: Literal["coerce", "strict"] = "coerce", 442 use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None, 443 dtypes: DTypeMap | None = None, 444 ) -> ExcelSheet: 445 """Loads a sheet by index. 446 447 Refer to `load_sheet` for parameter documentation 448 """ 449 return self.load_sheet( 450 idx, 451 header_row=header_row, 452 column_names=column_names, 453 skip_rows=skip_rows, 454 n_rows=n_rows, 455 schema_sample_rows=schema_sample_rows, 456 dtype_coercion=dtype_coercion, 457 use_columns=use_columns, 458 dtypes=dtypes, 459 ) 460 461 def __repr__(self) -> str: 462 return self._reader.__repr__()
A class representing an open Excel file and allowing to read its sheets
201 @property 202 def sheet_names(self) -> list[str]: 203 """The list of sheet names""" 204 return self._reader.sheet_names
The list of sheet names
206 def load_sheet( 207 self, 208 idx_or_name: int | str, 209 *, 210 header_row: int | None = 0, 211 column_names: list[str] | None = None, 212 skip_rows: int = 0, 213 n_rows: int | None = None, 214 schema_sample_rows: int | None = 1_000, 215 dtype_coercion: Literal["coerce", "strict"] = "coerce", 216 use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None, 217 dtypes: DTypeMap | None = None, 218 ) -> ExcelSheet: 219 """Loads a sheet lazily by index or name. 220 221 :param idx_or_name: The index (starting at 0) or the name of the sheet to load. 222 :param header_row: The index of the row containing the column labels, default index is 0. 223 If `None`, the sheet does not have any column labels. 224 Any rows before the `header_row` will be automatically skipped. 225 :param column_names: Overrides headers found in the document. 226 If `column_names` is used, `header_row` will be ignored. 227 :param n_rows: Specifies how many rows should be loaded. 228 If `None`, all rows are loaded 229 :param skip_rows: Specifies how many rows should be skipped after the `header_row`. 230 Any rows before the `header_row` are automatically skipped. 231 If `header_row` is `None`, it skips the number of rows from the 232 start of the sheet. 233 :param schema_sample_rows: Specifies how many rows should be used to determine 234 the dtype of a column. 235 If `None`, all rows will be used. 236 :param dtype_coercion: Specifies how type coercion should behave. `coerce` (the default) 237 will try to coerce different dtypes in a column to the same one, 238 whereas `strict` will raise an error in case a column contains 239 several dtypes. Note that this only applies to columns whose dtype 240 is guessed, i.e. not specified via `dtypes`. 241 :param use_columns: Specifies the columns to use. Can either be: 242 - `None` to select all columns 243 - A list of strings and ints, the column names and/or indices 244 (starting at 0) 245 - A string, a comma separated list of Excel column letters and column 246 ranges (e.g. `“A:E”` or `“A,C,E:F”`, which would result in 247 `A,B,C,D,E` and `A,C,E,F`) 248 - A callable, a function that takes a column and returns a boolean 249 indicating whether the column should be used 250 :param dtypes: An optional dict of dtypes. Keys can be column indices or names 251 """ 252 return ExcelSheet( 253 self._reader.load_sheet( 254 idx_or_name=idx_or_name, 255 header_row=header_row, 256 column_names=column_names, 257 skip_rows=skip_rows, 258 n_rows=n_rows, 259 schema_sample_rows=schema_sample_rows, 260 dtype_coercion=dtype_coercion, 261 use_columns=use_columns, 262 dtypes=dtypes, 263 eager=False, 264 ) 265 )
Loads a sheet lazily by index or name.
Parameters
- idx_or_name: The index (starting at 0) or the name of the sheet to load.
- header_row: The index of the row containing the column labels, default index is 0.
If
None
, the sheet does not have any column labels. Any rows before theheader_row
will be automatically skipped. - column_names: Overrides headers found in the document.
If
column_names
is used,header_row
will be ignored. - n_rows: Specifies how many rows should be loaded.
If
None
, all rows are loaded - skip_rows: Specifies how many rows should be skipped after the
header_row
. Any rows before theheader_row
are automatically skipped. Ifheader_row
isNone
, it skips the number of rows from the start of the sheet. - schema_sample_rows: Specifies how many rows should be used to determine
the dtype of a column.
If
None
, all rows will be used. - dtype_coercion: Specifies how type coercion should behave.
coerce
(the default) will try to coerce different dtypes in a column to the same one, whereasstrict
will raise an error in case a column contains several dtypes. Note that this only applies to columns whose dtype is guessed, i.e. not specified viadtypes
. - use_columns: Specifies the columns to use. Can either be:
None
to select all columns- A list of strings and ints, the column names and/or indices (starting at 0)
- A string, a comma separated list of Excel column letters and column
ranges (e.g.
“A:E”
or“A,C,E:F”
, which would result inA,B,C,D,E
andA,C,E,F
) - A callable, a function that takes a column and returns a boolean indicating whether the column should be used
- dtypes: An optional dict of dtypes. Keys can be column indices or names
267 def table_names(self, sheet_idx_or_name: str | int | None = None) -> list[str]: 268 """The list of table names. 269 270 Will return an empty list if no tables are found. 271 272 :param sheet_idx_or_name: If given, will limit the list to the given sheet, will be faster 273 too. 274 """ 275 return self._reader.table_names(sheet_idx_or_name)
The list of table names.
Will return an empty list if no tables are found.
Parameters
- sheet_idx_or_name: If given, will limit the list to the given sheet, will be faster too.
307 def load_table( 308 self, 309 name: str, 310 *, 311 header_row: int | None = None, 312 column_names: list[str] | None = None, 313 skip_rows: int = 0, 314 n_rows: int | None = None, 315 schema_sample_rows: int | None = 1_000, 316 dtype_coercion: Literal["coerce", "strict"] = "coerce", 317 use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None, 318 dtypes: DTypeMap | None = None, 319 eager: bool = False, 320 ) -> ExcelTable | pa.RecordBatch: 321 """Loads a table by name. 322 323 :param name: The name of the table to load. 324 :param header_row: The index of the row containing the column labels. 325 If `None`, the table's column names will be used. 326 Any rows before the `header_row` will be automatically skipped. 327 :param column_names: Overrides headers found in the document. 328 If `column_names` is used, `header_row` will be ignored. 329 :param n_rows: Specifies how many rows should be loaded. 330 If `None`, all rows are loaded 331 :param skip_rows: Specifies how many rows should be skipped after the `header_row`. 332 Any rows before the `header_row` are automatically skipped. 333 If `header_row` is `None`, it skips the number of rows from the 334 start of the sheet. 335 :param schema_sample_rows: Specifies how many rows should be used to determine 336 the dtype of a column. 337 If `None`, all rows will be used. 338 :param dtype_coercion: Specifies how type coercion should behave. `coerce` (the default) 339 will try to coerce different dtypes in a column to the same one, 340 whereas `strict` will raise an error in case a column contains 341 several dtypes. Note that this only applies to columns whose dtype 342 is guessed, i.e. not specified via `dtypes`. 343 :param use_columns: Specifies the columns to use. Can either be: 344 - `None` to select all columns 345 - A list of strings and ints, the column names and/or indices 346 (starting at 0) 347 - A string, a comma separated list of Excel column letters and column 348 ranges (e.g. `“A:E”` or `“A,C,E:F”`, which would result in 349 `A,B,C,D,E` and `A,C,E,F`) 350 - A callable, a function that takes a column and returns a boolean 351 indicating whether the column should be used 352 :param dtypes: An optional dict of dtypes. Keys can be column indices or names 353 """ 354 output = self._reader.load_table( # type:ignore[call-overload,misc] 355 name=name, 356 header_row=header_row, 357 column_names=column_names, 358 skip_rows=skip_rows, 359 n_rows=n_rows, 360 schema_sample_rows=schema_sample_rows, 361 dtype_coercion=dtype_coercion, 362 use_columns=use_columns, 363 dtypes=dtypes, 364 eager=eager, 365 ) 366 if eager: 367 return output 368 return ExcelTable(output)
Loads a table by name.
Parameters
- name: The name of the table to load.
- header_row: The index of the row containing the column labels.
If
None
, the table's column names will be used. Any rows before theheader_row
will be automatically skipped. - column_names: Overrides headers found in the document.
If
column_names
is used,header_row
will be ignored. - n_rows: Specifies how many rows should be loaded.
If
None
, all rows are loaded - skip_rows: Specifies how many rows should be skipped after the
header_row
. Any rows before theheader_row
are automatically skipped. Ifheader_row
isNone
, it skips the number of rows from the start of the sheet. - schema_sample_rows: Specifies how many rows should be used to determine
the dtype of a column.
If
None
, all rows will be used. - dtype_coercion: Specifies how type coercion should behave.
coerce
(the default) will try to coerce different dtypes in a column to the same one, whereasstrict
will raise an error in case a column contains several dtypes. Note that this only applies to columns whose dtype is guessed, i.e. not specified viadtypes
. - use_columns: Specifies the columns to use. Can either be:
None
to select all columns- A list of strings and ints, the column names and/or indices (starting at 0)
- A string, a comma separated list of Excel column letters and column
ranges (e.g.
“A:E”
or“A,C,E:F”
, which would result inA,B,C,D,E
andA,C,E,F
) - A callable, a function that takes a column and returns a boolean indicating whether the column should be used
- dtypes: An optional dict of dtypes. Keys can be column indices or names
370 def load_sheet_eager( 371 self, 372 idx_or_name: int | str, 373 *, 374 header_row: int | None = 0, 375 column_names: list[str] | None = None, 376 skip_rows: int = 0, 377 n_rows: int | None = None, 378 schema_sample_rows: int | None = 1_000, 379 dtype_coercion: Literal["coerce", "strict"] = "coerce", 380 use_columns: list[str] | list[int] | str | None = None, 381 dtypes: DTypeMap | None = None, 382 ) -> pa.RecordBatch: 383 """Loads a sheet eagerly by index or name. 384 385 For xlsx files, this will be faster and more memory-efficient, as it will use 386 `worksheet_range_ref` under the hood, which returns borrowed types. 387 388 Refer to `load_sheet` for parameter documentation 389 """ 390 return self._reader.load_sheet( 391 idx_or_name=idx_or_name, 392 header_row=header_row, 393 column_names=column_names, 394 skip_rows=skip_rows, 395 n_rows=n_rows, 396 schema_sample_rows=schema_sample_rows, 397 dtype_coercion=dtype_coercion, 398 use_columns=use_columns, 399 dtypes=dtypes, 400 eager=True, 401 )
Loads a sheet eagerly by index or name.
For xlsx files, this will be faster and more memory-efficient, as it will use
worksheet_range_ref
under the hood, which returns borrowed types.
Refer to load_sheet
for parameter documentation
403 def load_sheet_by_name( 404 self, 405 name: str, 406 *, 407 header_row: int | None = 0, 408 column_names: list[str] | None = None, 409 skip_rows: int = 0, 410 n_rows: int | None = None, 411 schema_sample_rows: int | None = 1_000, 412 dtype_coercion: Literal["coerce", "strict"] = "coerce", 413 use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None, 414 dtypes: DTypeMap | None = None, 415 ) -> ExcelSheet: 416 """Loads a sheet by name. 417 418 Refer to `load_sheet` for parameter documentation 419 """ 420 return self.load_sheet( 421 name, 422 header_row=header_row, 423 column_names=column_names, 424 skip_rows=skip_rows, 425 n_rows=n_rows, 426 schema_sample_rows=schema_sample_rows, 427 dtype_coercion=dtype_coercion, 428 use_columns=use_columns, 429 dtypes=dtypes, 430 )
Loads a sheet by name.
Refer to load_sheet
for parameter documentation
432 def load_sheet_by_idx( 433 self, 434 idx: int, 435 *, 436 header_row: int | None = 0, 437 column_names: list[str] | None = None, 438 skip_rows: int = 0, 439 n_rows: int | None = None, 440 schema_sample_rows: int | None = 1_000, 441 dtype_coercion: Literal["coerce", "strict"] = "coerce", 442 use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None, 443 dtypes: DTypeMap | None = None, 444 ) -> ExcelSheet: 445 """Loads a sheet by index. 446 447 Refer to `load_sheet` for parameter documentation 448 """ 449 return self.load_sheet( 450 idx, 451 header_row=header_row, 452 column_names=column_names, 453 skip_rows=skip_rows, 454 n_rows=n_rows, 455 schema_sample_rows=schema_sample_rows, 456 dtype_coercion=dtype_coercion, 457 use_columns=use_columns, 458 dtypes=dtypes, 459 )
Loads a sheet by index.
Refer to load_sheet
for parameter documentation
55class ExcelSheet: 56 """A class representing a single sheet in an Excel File""" 57 58 def __init__(self, sheet: _ExcelSheet) -> None: 59 self._sheet = sheet 60 61 @property 62 def name(self) -> str: 63 """The name of the sheet""" 64 return self._sheet.name 65 66 @property 67 def width(self) -> int: 68 """The sheet's width""" 69 return self._sheet.width 70 71 @property 72 def height(self) -> int: 73 """The sheet's height, with `skip_rows` and `nrows` applied""" 74 return self._sheet.height 75 76 @property 77 def total_height(self) -> int: 78 """The sheet's total height""" 79 return self._sheet.total_height 80 81 @property 82 def selected_columns(self) -> list[ColumnInfo]: 83 """The sheet's selected columns""" 84 return self._sheet.selected_columns 85 86 @property 87 def available_columns(self) -> list[ColumnInfo]: 88 """The columns available for the given sheet""" 89 return self._sheet.available_columns 90 91 @property 92 def specified_dtypes(self) -> DTypeMap | None: 93 """The dtypes specified for the sheet""" 94 return self._sheet.specified_dtypes 95 96 @property 97 def visible(self) -> SheetVisible: 98 """The visibility of the sheet""" 99 return self._sheet.visible 100 101 def to_arrow(self) -> pa.RecordBatch: 102 """Converts the sheet to a pyarrow `RecordBatch`""" 103 return self._sheet.to_arrow() 104 105 def to_pandas(self) -> "pd.DataFrame": 106 """Converts the sheet to a Pandas `DataFrame`. 107 108 Requires the `pandas` extra to be installed. 109 """ 110 # We know for sure that the sheet will yield exactly one RecordBatch 111 return self.to_arrow().to_pandas() 112 113 def to_polars(self) -> "pl.DataFrame": 114 """Converts the sheet to a Polars `DataFrame`. 115 116 Requires the `polars` extra to be installed. 117 """ 118 return _recordbatch_to_polars(self.to_arrow()) 119 120 def __repr__(self) -> str: 121 return self._sheet.__repr__()
A class representing a single sheet in an Excel File
71 @property 72 def height(self) -> int: 73 """The sheet's height, with `skip_rows` and `nrows` applied""" 74 return self._sheet.height
The sheet's height, with skip_rows
and nrows
applied
76 @property 77 def total_height(self) -> int: 78 """The sheet's total height""" 79 return self._sheet.total_height
The sheet's total height
81 @property 82 def selected_columns(self) -> list[ColumnInfo]: 83 """The sheet's selected columns""" 84 return self._sheet.selected_columns
The sheet's selected columns
86 @property 87 def available_columns(self) -> list[ColumnInfo]: 88 """The columns available for the given sheet""" 89 return self._sheet.available_columns
The columns available for the given sheet
91 @property 92 def specified_dtypes(self) -> DTypeMap | None: 93 """The dtypes specified for the sheet""" 94 return self._sheet.specified_dtypes
The dtypes specified for the sheet
96 @property 97 def visible(self) -> SheetVisible: 98 """The visibility of the sheet""" 99 return self._sheet.visible
The visibility of the sheet
101 def to_arrow(self) -> pa.RecordBatch: 102 """Converts the sheet to a pyarrow `RecordBatch`""" 103 return self._sheet.to_arrow()
Converts the sheet to a pyarrow RecordBatch
105 def to_pandas(self) -> "pd.DataFrame": 106 """Converts the sheet to a Pandas `DataFrame`. 107 108 Requires the `pandas` extra to be installed. 109 """ 110 # We know for sure that the sheet will yield exactly one RecordBatch 111 return self.to_arrow().to_pandas()
Converts the sheet to a Pandas DataFrame
.
Requires the pandas
extra to be installed.
113 def to_polars(self) -> "pl.DataFrame": 114 """Converts the sheet to a Polars `DataFrame`. 115 116 Requires the `polars` extra to be installed. 117 """ 118 return _recordbatch_to_polars(self.to_arrow())
Converts the sheet to a Polars DataFrame
.
Requires the polars
extra to be installed.
This class provides information about a single column in a sheet
fastexcel.ColumnNameFrom
. How the name of the column was determined.
One of three possible values:
"provided"
: The column name was provided via theuse_columns
parameter"looked_up"
: The column name was looked up from the data found in the sheet"generated"
: The column name was generated from the column index, either becauseheader_row
wasNone
, or because it could not be looked up
fastexcel.DTypeFrom
. How the dtype of the column was determined.
One of three possible values:
"provided_by_index"
: The dtype was specified via the column index"provided_by_name"
: The dtype was specified via the column name"guessed"
: The dtype was determined from the content of the column
The base class for all fastexcel errors
Inherited Members
- builtins.Exception
- Exception
- builtins.BaseException
- with_traceback
- add_note
- args
Data for a given cell cannot be retrieved
Inherited Members
- builtins.Exception
- Exception
- builtins.BaseException
- with_traceback
- add_note
- args
calamine returned an error regarding the content of the cell
Inherited Members
- builtins.Exception
- Exception
- builtins.BaseException
- with_traceback
- add_note
- args
Generic calamine error
Inherited Members
- builtins.Exception
- Exception
- builtins.BaseException
- with_traceback
- add_note
- args
Sheet was not found
Inherited Members
- builtins.Exception
- Exception
- builtins.BaseException
- with_traceback
- add_note
- args
Column was not found
Inherited Members
- builtins.Exception
- Exception
- builtins.BaseException
- with_traceback
- add_note
- args
Generic arrow error
Inherited Members
- builtins.Exception
- Exception
- builtins.BaseException
- with_traceback
- add_note
- args
Provided parameters are invalid
Inherited Members
- builtins.Exception
- Exception
- builtins.BaseException
- with_traceback
- add_note
- args
Column contains an unsupported type combination
Inherited Members
- builtins.Exception
- Exception
- builtins.BaseException
- with_traceback
- add_note
- args