Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 0 additions & 7 deletions autoPyTorch/api/tabular_classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -384,13 +384,6 @@ def search(
dataset_name=dataset_name
)

if not isinstance(self.resampling_strategy, (CrossValTypes, HoldoutValTypes)):
raise ValueError(
'Hyperparameter optimization requires a validation split. '
'Expected `self.resampling_strategy` to be either '
'(CrossValTypes, HoldoutValTypes), but got {}'.format(self.resampling_strategy)
)

return self._search(
dataset=self.dataset,
optimize_metric=optimize_metric,
Expand Down
7 changes: 0 additions & 7 deletions autoPyTorch/api/tabular_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -384,13 +384,6 @@ def search(
dataset_name=dataset_name
)

if not isinstance(self.resampling_strategy, (CrossValTypes, HoldoutValTypes)):
raise ValueError(
'Hyperparameter optimization requires a validation split. '
'Expected `self.resampling_strategy` to be either '
'(CrossValTypes, HoldoutValTypes), but got {}'.format(self.resampling_strategy)
)

return self._search(
dataset=self.dataset,
optimize_metric=optimize_metric,
Expand Down
56 changes: 35 additions & 21 deletions autoPyTorch/data/tabular_feature_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,12 @@ def _fit(
all_nan_columns = X.columns[X.isna().all()]
for col in all_nan_columns:
X[col] = pd.to_numeric(X[col])

# Handle objects if possible
exist_object_columns = has_object_columns(X.dtypes.values)
if exist_object_columns:
X = self.infer_objects(X)

self.dtypes = [dt.name for dt in X.dtypes] # Also note this change in self.dtypes
self.all_nan_columns = set(all_nan_columns)

Expand Down Expand Up @@ -260,20 +266,22 @@ def transform(

if hasattr(X, "iloc") and not scipy.sparse.issparse(X):
X = cast(Type[pd.DataFrame], X)
if self.all_nan_columns is not None:
for column in X.columns:
if column in self.all_nan_columns:
if not X[column].isna().all():
X[column] = np.nan
X[column] = pd.to_numeric(X[column])
if len(self.categorical_columns) > 0:
if self.column_transformer is None:
raise AttributeError("Expect column transformer to be built"
"if there are categorical columns")
categorical_columns = self.column_transformer.transformers_[0][-1]
for column in categorical_columns:
if X[column].isna().all():
X[column] = X[column].astype('object')

if self.all_nan_columns is None:
raise ValueError('_fit must be called before calling transform')

for col in list(self.all_nan_columns):
X[col] = np.nan
X[col] = pd.to_numeric(X[col])

if len(self.categorical_columns) > 0:
if self.column_transformer is None:
raise AttributeError("Expect column transformer to be built "
"if there are categorical columns")
categorical_columns = self.column_transformer.transformers_[0][-1]
for column in categorical_columns:
if X[column].isna().all():
X[column] = X[column].astype('object')

# Check the data here so we catch problems on new test data
self._check_data(X)
Expand Down Expand Up @@ -366,10 +374,10 @@ def _check_data(
self.column_order = column_order

dtypes = [dtype.name for dtype in X.dtypes]

diff_cols = X.columns[[s_dtype != dtype for s_dtype, dtype in zip(self.dtypes, dtypes)]]
if len(self.dtypes) == 0:
self.dtypes = dtypes
elif self.dtypes != dtypes:
elif not self._is_datasets_consistent(diff_cols, X):
raise ValueError("The dtype of the features must not be changed after fit(), but"
" the dtypes of some columns are different between training ({}) and"
" test ({}) datasets.".format(self.dtypes, dtypes))
Expand Down Expand Up @@ -517,11 +525,17 @@ def infer_objects(self, X: pd.DataFrame) -> pd.DataFrame:
self.logger.warning(f'Casting the column {key} to {dtype} caused the exception {e}')
pass
else:
# Calling for the first time to infer the categories
X = X.infer_objects()
for column, data_type in zip(X.columns, X.dtypes):
if not is_numeric_dtype(data_type):
X[column] = X[column].astype('category')
if len(self.dtypes) != 0:
# when train data has no object dtype, but test does
# we prioritise the datatype given in training data
for column, data_type in zip(X.columns, self.dtypes):
X[column] = X[column].astype(data_type)
else:
# Calling for the first time to infer the categories
X = X.infer_objects()
for column, data_type in zip(X.columns, X.dtypes):
if not is_numeric_dtype(data_type):
X[column] = X[column].astype('category')
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if len(self.dtypes) != 0:
# when train data has no object dtype, but test does
# we prioritise the datatype given in training data
for column, data_type in zip(X.columns, self.dtypes):
X[column] = X[column].astype(data_type)
else:
# Calling for the first time to infer the categories
X = X.infer_objects()
for column, data_type in zip(X.columns, X.dtypes):
if not is_numeric_dtype(data_type):
X[column] = X[column].astype('category')
elif len(self.dtypes) != 0: # when train data has no object dtype, but test does
# we prioritise the datatype given in training data
for column, data_type in zip(X.columns, self.dtypes):
X[column] = X[column].astype(data_type)
else: # Calling for the first time to infer the categories
X = X.infer_objects()
for column, data_type in zip(X.columns, X.dtypes):
if not is_numeric_dtype(data_type):
X[column] = X[column].astype('category')

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think these are just preferences on where to start the comment.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, it actually removed an indent level.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

actually, if you notice, we are also saving the dtypes in self.object_dtype_mapping, which should be done for both of the two conditions you moved back an indent level. So, I think it's fine the way it is.

Copy link
Collaborator

@nabenabe0928 nabenabe0928 Feb 3, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh yeah, I did not notice, but I also did not notice that we still have the same issue (which happens when we have a huge number of features) in this method.
Could you fix it?

        if hasattr(self, 'object_dtype_mapping'):
            # Mypy does not process the has attr. This dict is defined below
            try:
                X = X.astype(self.object_dtype_mapping)
            except Exception as e:
                self.logger.warning(f'Casting test data to data type in train data caused the exception {e}')
                pass
            return X

        if len(self.dtypes) != 0:
            # when train data has no object dtype, but test does.  Prioritise the datatype given in training data
            dtype_dict = {col: dtype for col, dtype in zip(X.columns, self.dtypes)}
            X = X.astype(dtype_dict)
        else:
            # Calling for the first time to infer the categories
            X = X.infer_objects()
            dtype_dict = {col: 'category' for col, dtype in zip(X.columns, X.dtypes) if not is_numeric_dtype(dtype)}
            X = X.astype(dtype_dict)

        # only numerical attributes and categories
        self.object_dtype_mapping = {col: dtype for col, dtype in zip(X.columns, X.dtypes)}


# only numerical attributes and categories
self.object_dtype_mapping = {column: data_type for column, data_type in zip(X.columns, X.dtypes)}
Expand Down
Loading