Skip to content

Commit 337d5fe

Browse files
authored
REF: separate out helpers in libparser (#61832)
1 parent d785a3d commit 337d5fe

File tree

1 file changed

+47
-39
lines changed

1 file changed

+47
-39
lines changed

pandas/_libs/parsers.pyx

Lines changed: 47 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -340,7 +340,7 @@ cdef class TextReader:
340340
cdef:
341341
parser_t *parser
342342
object na_fvalues
343-
object true_values, false_values
343+
list true_values, false_values
344344
object handle
345345
object orig_header
346346
bint na_filter, keep_default_na, has_usecols, has_mi_columns
@@ -942,6 +942,7 @@ cdef class TextReader:
942942
bint na_filter = 0
943943
int64_t num_cols
944944
dict results
945+
bint is_default_dict_dtype
945946

946947
start = self.parser_start
947948

@@ -957,26 +958,7 @@ cdef class TextReader:
957958
self.parser.line_fields[i] + \
958959
(num_cols >= self.parser.line_fields[i]) * num_cols
959960

960-
usecols_not_callable_and_exists = not callable(self.usecols) and self.usecols
961-
names_larger_num_cols = (self.names and
962-
len(self.names) - self.leading_cols > num_cols)
963-
964-
if self.table_width - self.leading_cols > num_cols:
965-
if (usecols_not_callable_and_exists
966-
and self.table_width - self.leading_cols < len(self.usecols)
967-
or names_larger_num_cols):
968-
raise ParserError(f"Too many columns specified: expected "
969-
f"{self.table_width - self.leading_cols} "
970-
f"and found {num_cols}")
971-
972-
if (usecols_not_callable_and_exists and
973-
all(isinstance(u, int) for u in self.usecols)):
974-
missing_usecols = [col for col in self.usecols if col >= num_cols]
975-
if missing_usecols:
976-
raise ParserError(
977-
"Defining usecols with out-of-bounds indices is not allowed. "
978-
f"{missing_usecols} are out of bounds.",
979-
)
961+
self._validate_usecols_and_names(num_cols)
980962

981963
results = {}
982964
nused = 0
@@ -1004,22 +986,7 @@ cdef class TextReader:
1004986
nused += 1
1005987

1006988
conv = self._get_converter(i, name)
1007-
1008-
col_dtype = None
1009-
if self.dtype is not None:
1010-
if isinstance(self.dtype, dict):
1011-
if name in self.dtype:
1012-
col_dtype = self.dtype[name]
1013-
elif i in self.dtype:
1014-
col_dtype = self.dtype[i]
1015-
elif is_default_dict_dtype:
1016-
col_dtype = self.dtype[name]
1017-
else:
1018-
if self.dtype.names:
1019-
# structured array
1020-
col_dtype = np.dtype(self.dtype.descr[i][1])
1021-
else:
1022-
col_dtype = self.dtype
989+
col_dtype = self._get_col_dtype(i, is_default_dict_dtype, name)
1023990

1024991
if conv:
1025992
if col_dtype is not None:
@@ -1267,6 +1234,47 @@ cdef class TextReader:
12671234
return _string_box_utf8(self.parser, i, start, end, na_filter,
12681235
na_hashset, self.encoding_errors)
12691236

1237+
cdef void _validate_usecols_and_names(self, int64_t num_cols):
    """
    Check ``self.usecols`` and ``self.names`` against the parsed column count.

    Parameters
    ----------
    num_cols : int64_t
        Number of columns found in the parsed data. Declared ``int64_t``
        (rather than ``int``) so the value is not narrowed when a caller
        passes an ``int64_t`` count.

    Raises
    ------
    ParserError
        If more columns are expected (``table_width - leading_cols``) than
        were found and either a non-callable ``usecols`` is longer than the
        expected width or ``names`` (minus leading columns) is longer than
        ``num_cols``; or if a non-callable, all-integer ``usecols`` contains
        an index ``>= num_cols``.
    """
    # NOTE(review): this is a ``cdef void`` function that raises; exception
    # propagation depends on the Cython version/directives in use (Cython 3
    # propagates by default unless ``noexcept`` is given) -- confirm.

    # Truthy only for a non-empty, non-callable usecols.
    usecols_not_callable_and_exists = not callable(self.usecols) and self.usecols
    names_larger_num_cols = (self.names and
                             len(self.names) - self.leading_cols > num_cols)

    # Number of data columns the reader expects, excluding leading columns.
    expected_cols = self.table_width - self.leading_cols

    if expected_cols > num_cols:
        # Parenthesised to make the ``and``/``or`` precedence explicit.
        if ((usecols_not_callable_and_exists
                and expected_cols < len(self.usecols))
                or names_larger_num_cols):
            raise ParserError(f"Too many columns specified: expected "
                              f"{expected_cols} "
                              f"and found {num_cols}")

    if (usecols_not_callable_and_exists and
            all(isinstance(u, int) for u in self.usecols)):
        # Positional usecols must all refer to columns that actually exist.
        missing_usecols = [col for col in self.usecols if col >= num_cols]
        if missing_usecols:
            raise ParserError(
                "Defining usecols with out-of-bounds indices is not allowed. "
                f"{missing_usecols} are out of bounds.",
            )
1258+
1259+
# -> DtypeObj
1260+
cdef object _get_col_dtype(self, int64_t i, bint is_default_dict_dtype, name):
    """
    Resolve the dtype requested for a single column from ``self.dtype``.

    Parameters
    ----------
    i : int64_t
        Positional index of the column.
    is_default_dict_dtype : bint
        When true and ``self.dtype`` is a dict containing neither ``name``
        nor ``i``, ``self.dtype[name]`` is still looked up (presumably the
        dict is a ``defaultdict`` supplying a fallback -- verify at caller).
    name : object
        Column label; takes precedence over ``i`` for dict lookups.

    Returns
    -------
    object
        The dtype for the column, or None when ``self.dtype`` is None or a
        dict with no applicable entry.
    """
    requested = self.dtype
    if requested is None:
        return None

    if not isinstance(requested, dict):
        if requested.names:
            # structured array
            return np.dtype(requested.descr[i][1])
        # A single dtype applies to every column.
        return requested

    # dict-like: label match first, then positional match, then fallback.
    if name in requested:
        return requested[name]
    if i in requested:
        return requested[i]
    if is_default_dict_dtype:
        return requested[name]
    return None
1277+
12701278
def _get_converter(self, i: int, name):
12711279
if self.converters is None:
12721280
return None
@@ -1347,8 +1355,8 @@ cdef _close(TextReader reader):
13471355

13481356

13491357
cdef:
1350-
object _true_values = [b"True", b"TRUE", b"true"]
1351-
object _false_values = [b"False", b"FALSE", b"false"]
1358+
list _true_values = [b"True", b"TRUE", b"true"]
1359+
list _false_values = [b"False", b"FALSE", b"false"]
13521360

13531361

13541362
def _ensure_encoded(list lst):

0 commit comments

Comments
 (0)