
#  -----------------------------------------------------------------------------------------
#  (C) Copyright IBM Corp. 2020-2023.
#  https://opensource.org/licenses/BSD-3-Clause
#  -----------------------------------------------------------------------------------------

import copy
from typing import List, Union
from warnings import warn

from ibm_watson_machine_learning.preprocessing import DataJoinGraph
from ibm_watson_machine_learning.utils.autoai.enums import (
    TShirtSize, ClassificationAlgorithms, RegressionAlgorithms, ForecastingAlgorithms, PredictionType, Metrics,
    Transformers, DataConnectionTypes, PipelineTypes, PositiveLabelClass, ClassificationAlgorithmsCP4D,
    RegressionAlgorithmsCP4D, ForecastingAlgorithmsCP4D, SamplingTypes, ImputationStrategy, ForecastingPipelineTypes,
    TimeseriesAnomalyPredictionPipelineTypes, TimeseriesAnomalyPredictionAlgorithms)
from ibm_watson_machine_learning.utils.autoai.errors import LocalInstanceButRemoteParameter, MissingPositiveLabel, \
    NonForecastPredictionColumnMissing, ForecastPredictionColumnsMissing, ForecastingCannotBeRunAsLocalScenario, \
    TSNotSupported, TSADNotSupported, ParamOutOfRange, ImputationListNotSupported, \
    MissingEstimatorForExistingBatchedEstimator, TimeseriesAnomalyPredictionFeatureColumnsMissing, \
    TimeseriesAnomalyPredictionCannotBeRunAsLocalScenario, TimeseriesAnomalyPredictionUnsupportedMetric
from ibm_watson_machine_learning.utils.autoai.utils import check_dependencies_versions, \
    validate_additional_params_for_optimizer, validate_optimizer_enum_values, \
    translate_imputation_string_strategy_to_enum, translate_estimator_string_to_enum, \
    translate_batched_estimator_string_to_enum
from ibm_watson_machine_learning.workspace import WorkSpace
from ibm_watson_machine_learning.wml_client_error import ForbiddenActionForGitBasedProject, WMLClientError
from ibm_watson_machine_learning.messages.messages import Messages
from .engines import WMLEngine
from .optimizers import LocalAutoPipelines, RemoteAutoPipelines
from .runs import AutoPipelinesRuns, LocalAutoPipelinesRuns
from ..base_experiment.base_experiment import BaseExperiment

__all__ = [
    "AutoAI"
]


class AutoAI(BaseExperiment):
    """AutoAI class for pipeline models optimization automation.

    :param wml_credentials: credentials to Watson Machine Learning instance
    :type wml_credentials: dict

    :param project_id: ID of the Watson Studio project
    :type project_id: str, optional

    :param space_id: ID of the Watson Studio Space
    :type space_id: str, optional

    :param verify: one of the following can be passed as `verify`:

        - the path to a CA_BUNDLE file
        - the path of a directory with certificates of trusted CAs
        - `True` - the default path to the truststore will be taken
        - `False` - no verification will be made
    :type verify: bool or str, optional

    **Example**

    .. code-block:: python

        from ibm_watson_machine_learning.experiment import AutoAI

        experiment = AutoAI(
            wml_credentials={
                "apikey": "...",
                "iam_apikey_description": "...",
                "iam_apikey_name": "...",
                "iam_role_crn": "...",
                "iam_serviceid_crn": "...",
                "instance_id": "...",
                "url": "https://us-south.ml.cloud.ibm.com"
            },
            project_id="...",
            space_id="...")
    """

    # note: initialization of AutoAI enums as class properties
    # note: enums with estimators can be overwritten in _init_estimator_enums based on environment type (CPD or Cloud)
    ClassificationAlgorithms = ClassificationAlgorithms
    RegressionAlgorithms = RegressionAlgorithms
    ForecastingAlgorithms = ForecastingAlgorithms
    # end note
    TShirtSize = TShirtSize
    PredictionType = PredictionType
    Metrics = Metrics
    Transformers = Transformers
    DataConnectionTypes = DataConnectionTypes
    PipelineTypes = PipelineTypes
    SamplingTypes = SamplingTypes

    def __init__(self,
                 wml_credentials: Union[dict, 'WorkSpace'] = None,
                 project_id: str = None,
                 space_id: str = None,
                 verify=None) -> None:
        # note: as the WorkSpace concept alone is not clear enough, pure
        # wml credentials with project and space IDs can be used; in addition we
        # keep the possibility to pass a previous WorkSpace implementation as the first argument
        if wml_credentials is None:
            self._workspace = None
            self.runs = LocalAutoPipelinesRuns()
        else:
            if isinstance(wml_credentials, WorkSpace):
                self._workspace = wml_credentials
            else:
                self._workspace = WorkSpace(wml_credentials=wml_credentials.copy(),
                                            project_id=project_id,
                                            space_id=space_id,
                                            verify=verify)
            self.project_id = self._workspace.project_id
            self.space_id = self._workspace.space_id
            self.runs = AutoPipelinesRuns(engine=WMLEngine(self._workspace))
            self.runs._workspace = self._workspace

        # self._block_autoai_on_git_based_project()
        self._init_estimator_enums()
        self._20_class_limit_removal_test = False
        # --- end note
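
    # A minimal sketch (as comments, to keep the module importable) of the two
    # initialization paths described in the note above; credentials and IDs are
    # placeholders:
    #
    #     from ibm_watson_machine_learning.experiment import AutoAI
    #     from ibm_watson_machine_learning.workspace import WorkSpace
    #
    #     credentials = {"apikey": "...", "url": "https://us-south.ml.cloud.ibm.com"}
    #
    #     # path 1: pure wml credentials plus a project (or space) ID
    #     experiment = AutoAI(credentials, project_id="...")
    #
    #     # path 2: a previously created WorkSpace passed as the first argument
    #     experiment = AutoAI(WorkSpace(wml_credentials=credentials, project_id="..."))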

    def runs(self, *, filter: str) -> Union['AutoPipelinesRuns', 'LocalAutoPipelinesRuns']:
        """Get the historical runs, filtered by WML Pipeline name (for the remote scenario)
        or by experiment name (for the local scenario).

        :param filter: WML Pipeline name to filter the historical runs, or experiment name
            to filter the local historical runs
        :type filter: str

        :return: object managing the list of runs
        :rtype: AutoPipelinesRuns or LocalAutoPipelinesRuns

        **Example**

        .. code-block:: python

            from ibm_watson_machine_learning.experiment import AutoAI

            experiment = AutoAI(...)
            experiment.runs(filter='Test').list()
        """
        if self._workspace is None:
            return LocalAutoPipelinesRuns(filter=filter)

        else:
            return AutoPipelinesRuns(engine=WMLEngine(self._workspace), filter=filter)
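
    # A hedged sketch of the local/remote split above: without credentials the
    # experiment is local and runs are filtered by experiment name, with
    # credentials they are filtered by WML Pipeline name (names are placeholders):
    #
    #     local_experiment = AutoAI()  # no wml_credentials
    #     local_experiment.runs(filter='my experiment').list()
    #
    #     remote_experiment = AutoAI(credentials, space_id="...")
    #     remote_experiment.runs(filter='my pipeline').list()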

    def optimizer(self,
                  name: str,
                  *,
                  prediction_type: 'PredictionType',
                  prediction_column: str = None,
                  prediction_columns: List[str] = None,
                  timestamp_column_name: str = None,
                  scoring: 'Metrics' = None,
                  desc: str = None,
                  test_size: float = None,  # deprecated
                  holdout_size: float = 0.1,
                  max_number_of_estimators: int = None,
                  train_sample_rows_test_size: float = None,
                  include_only_estimators: List[Union['ClassificationAlgorithms',
                                                      'RegressionAlgorithms',
                                                      'ForecastingAlgorithms',
                                                      'TimeseriesAnomalyPredictionAlgorithms']] = None,
                  daub_include_only_estimators: List[Union['ClassificationAlgorithms',
                                                           'RegressionAlgorithms']] = None,  # deprecated
                  include_batched_ensemble_estimators: List[Union['BatchedClassificationAlgorithms',
                                                                  'BatchedRegressionAlgorithms']] = None,
                  backtest_num: int = None,
                  lookback_window: int = None,
                  forecast_window: int = None,
                  backtest_gap_length: int = None,
                  feature_columns: List[str] = None,
                  pipeline_types: List[Union['ForecastingPipelineTypes',
                                             'TimeseriesAnomalyPredictionPipelineTypes']] = None,
                  supporting_features_at_forecast: bool = None,
                  cognito_transform_names: List['Transformers'] = None,
                  data_join_graph: 'DataJoinGraph' = None,
                  csv_separator: Union[List[str], str] = ',',
                  excel_sheet: Union[str, int] = None,
                  encoding: str = 'utf-8',
                  positive_label: str = None,
                  data_join_only: bool = False,
                  drop_duplicates: bool = True,
                  outliers_columns: list = None,
                  text_processing: bool = None,
                  word2vec_feature_number: int = None,
                  daub_give_priority_to_runtime: float = None,
                  fairness_info: dict = None,
                  sampling_type: 'SamplingTypes' = None,
                  sample_size_limit: int = None,
                  sample_rows_limit: int = None,
                  sample_percentage_limit: float = None,
                  n_parallel_data_connections: int = None,
                  number_of_batch_rows: int = None,
                  categorical_imputation_strategy: ImputationStrategy = None,
                  numerical_imputation_strategy: ImputationStrategy = None,
                  numerical_imputation_value: float = None,
                  imputation_threshold: float = None,
                  retrain_on_holdout: bool = None,
                  categorical_columns: list = None,
                  numerical_columns: list = None,
                  test_data_csv_separator: Union[List[str], str] = ',',
                  test_data_excel_sheet: str = None,
                  test_data_encoding: str = 'utf-8',
                  confidence_level: float = 0.95,
                  **kwargs) -> Union['RemoteAutoPipelines', 'LocalAutoPipelines']:
        """Initialize an AutoAI optimizer.

        :param name: name for the AutoPipelines
        :type name: str

        :param prediction_type: type of the prediction
        :type prediction_type: PredictionType

        :param prediction_column: name of the target/label column, required for `multiclass`, `binary`
            and `regression` prediction types
        :type prediction_column: str, optional

        :param prediction_columns: names of the target/label columns, required for `forecasting`
            prediction type
        :type prediction_columns: list[str], optional

        :param timestamp_column_name: name of timestamp column for time series forecasting
        :type timestamp_column_name: str, optional

        :param scoring: type of the metric to optimize with, not used for forecasting
        :type scoring: Metrics, optional

        :param desc: description
        :type desc: str, optional

        :param test_size: deprecated, use `holdout_size` instead

        :param holdout_size: percentage of the entire dataset to leave as a holdout
        :type holdout_size: float, optional

        :param max_number_of_estimators: maximum number (top-K ranked by DAUB model selection) of the selected
            algorithm/estimator types, for example `LGBMClassifierEstimator`, `XGBoostClassifierEstimator`
            or `LogisticRegressionEstimator`, to use in pipeline composition, the default is `None`, which
            means the real default value is determined internally by the different algorithms, where only
            the algorithm type ranked highest by model selection is used
        :type max_number_of_estimators: int, optional

        :param train_sample_rows_test_size: training data sampling percentage
        :type train_sample_rows_test_size: float, optional

        :param daub_include_only_estimators: deprecated, use `include_only_estimators` instead

        :param include_batched_ensemble_estimators: list of batched ensemble estimators to include in the
            computation process, see: AutoAI.BatchedClassificationAlgorithms, AutoAI.BatchedRegressionAlgorithms
        :type include_batched_ensemble_estimators: list[BatchedClassificationAlgorithms or BatchedRegressionAlgorithms], optional

        :param include_only_estimators: list of estimators to include in the computation process, see:
            AutoAI.ClassificationAlgorithms, AutoAI.RegressionAlgorithms or AutoAI.ForecastingAlgorithms
        :type include_only_estimators: list[ClassificationAlgorithms or RegressionAlgorithms or ForecastingAlgorithms], optional

        :param backtest_num: number of backtests used for forecasting prediction type, default value: 4,
            value from range [0, 20]
        :type backtest_num: int, optional

        :param lookback_window: length of lookback window used for forecasting prediction type,
            default value: 10, if set to -1 the lookback window will be auto-detected
        :type lookback_window: int, optional

        :param forecast_window: length of forecast window used for forecasting prediction type,
            default value: 1, value from range [1, 60]
        :type forecast_window: int, optional

        :param backtest_gap_length: gap between backtests used for forecasting prediction type,
            default value: 0, value from range [0, data length / 4]
        :type backtest_gap_length: int, optional

        :param feature_columns: list of feature columns used for forecasting prediction type,
            may contain target column and/or supporting feature columns; for timeseries anomaly prediction
            type, the list of columns to be checked for anomalies
        :type feature_columns: list[str], optional

        :param pipeline_types: list of pipeline types to be used for forecasting or timeseries anomaly
            prediction type
        :type pipeline_types: list[ForecastingPipelineTypes or TimeseriesAnomalyPredictionPipelineTypes], optional

        :param supporting_features_at_forecast: enables usage of future supporting feature values during forecast
        :type supporting_features_at_forecast: bool, optional

        :param cognito_transform_names: list of transformers to include in the feature engineering
            computation process, see: AutoAI.Transformers
        :type cognito_transform_names: list[Transformers], optional

        :param csv_separator: the separator, or list of separators to try, for separating columns in a CSV file,
            not used if the file_name is not a CSV file, default is ','
        :type csv_separator: list[str] or str, optional

        :param excel_sheet: name of the excel sheet to use, only applicable when an xlsx file is the input,
            support for the number of the sheet is deprecated, by default the first sheet is used
        :type excel_sheet: str, optional

        :param encoding: encoding type for the CSV training file
        :type encoding: str, optional

        :param positive_label: the positive class to report for binary classification,
            ignored for multiclass or regression
        :type positive_label: str, optional

        :param t_shirt_size: the size of the remote AutoAI POD instance (computing resources),
            only applicable to a remote scenario, see: AutoAI.TShirtSize
        :type t_shirt_size: TShirtSize, optional

        :param data_join_graph: a graph object with the definition of the join structure for multiple
            input data sources, data preprocess step for multiple files
        :type data_join_graph: DataJoinGraph, optional

        :param data_join_only: if `True` only preprocessing will be executed
        :type data_join_only: bool, optional

        :param drop_duplicates: if `True` duplicated rows in data will be removed before further processing
        :type drop_duplicates: bool, optional

        :param outliers_columns: replace outliers with NaN using the IQR method for the specified columns,
            by default turned on for regression learning type and the target column, to turn it off
            an empty list of columns must be passed
        :type outliers_columns: list, optional

        :param text_processing: if `True` text processing will be enabled, applicable only on Cloud
        :type text_processing: bool, optional

        :param word2vec_feature_number: number of features to be generated from a text column,
            applied only if `text_processing` is `True`, if `None` the default value will be taken
        :type word2vec_feature_number: int, optional

        :param daub_give_priority_to_runtime: the importance of run time over score for pipelines ranking,
            can take values between 0 and 5, if set to 0.0 only the score is used, if set to 1 score and
            runtime are weighted equally, if set to a value higher than 1 the runtime gets higher
            importance over the score
        :type daub_give_priority_to_runtime: float, optional

        :param fairness_info: dictionary that specifies metadata needed for measuring fairness,
            it contains three key values: `favorable_labels`, `unfavorable_labels` and `protected_attributes`,
            the `favorable_labels` attribute indicates that when the class column contains one of the values
            from the list, that is considered a positive outcome, the `unfavorable_labels` is opposite
            to the `favorable_labels` and is obligatory for regression learning type, a protected attribute
            is a list of features that partition the population into groups whose outcome should have parity,
            if the protected attributes list is empty then automatic detection of protected attributes will
            be run, if `fairness_info` is passed then the fairness metric will be calculated
        :type fairness_info: dict, optional

        :param n_parallel_data_connections: maximum number of parallel connections to the data source,
            supported only for IBM Cloud Pak® for Data 4.0.1 and above
        :type n_parallel_data_connections: int, optional

        :param categorical_imputation_strategy: missing values imputation strategy for categorical columns

            Possible values (only non-forecasting scenario):

            - ImputationStrategy.MEAN
            - ImputationStrategy.MEDIAN
            - ImputationStrategy.MOST_FREQUENT (default)

        :type categorical_imputation_strategy: ImputationStrategy, optional

        :param numerical_imputation_strategy: missing values imputation strategy for numerical columns

            Possible values (non-forecasting scenario):

            - ImputationStrategy.MEAN
            - ImputationStrategy.MEDIAN (default)
            - ImputationStrategy.MOST_FREQUENT

            Possible values (forecasting scenario):

            - ImputationStrategy.MEAN
            - ImputationStrategy.MEDIAN
            - ImputationStrategy.BEST_OF_DEFAULT_IMPUTERS (default)
            - ImputationStrategy.VALUE
            - ImputationStrategy.FLATTEN_ITERATIVE
            - ImputationStrategy.LINEAR
            - ImputationStrategy.CUBIC
            - ImputationStrategy.PREVIOUS
            - ImputationStrategy.NEXT
            - ImputationStrategy.NO_IMPUTATION

        :param numerical_imputation_value: value for filling missing values if numerical_imputation_strategy
            is set to ImputationStrategy.VALUE, for forecasting only
        :type numerical_imputation_value: float, optional

        :param imputation_threshold: maximum threshold of missing values imputation, for forecasting only
        :type imputation_threshold: float, optional

        :param retrain_on_holdout: if `True` final pipelines will be trained also on holdout data
        :type retrain_on_holdout: bool, optional

        :param categorical_columns: list of column names that must be treated as categorical
        :type categorical_columns: list, optional

        :param numerical_columns: list of column names that must be treated as numerical
        :type numerical_columns: list, optional

        :param sampling_type: type of sampling data for training, one of SamplingTypes enum values,
            default is SamplingTypes.FIRST_N_RECORDS, supported only for IBM Cloud Pak® for Data 4.0.1 and above
        :type sampling_type: str, optional

        :param sample_size_limit: the size of the sample upper bound (in bytes), the default value is 1GB,
            supported only for IBM Cloud Pak® for Data 4.5 and above
        :type sample_size_limit: int, optional

        :param sample_rows_limit: the size of the sample upper bound (in rows),
            supported only for IBM Cloud Pak® for Data 4.6 and above
        :type sample_rows_limit: int, optional

        :param sample_percentage_limit: the size of the sample upper bound (as a fraction of the dataset size),
            supported only for IBM Cloud Pak® for Data 4.6 and above
        :type sample_percentage_limit: float, optional

        :param number_of_batch_rows: number of rows to read in each batch when reading from a flight connection
        :type number_of_batch_rows: int, optional

        :param test_data_csv_separator: the separator, or list of separators to try, for separating columns
            in a CSV user-defined holdout/test file, not used if the file_name is not a CSV file, default is ','
        :type test_data_csv_separator: list[str] or str, optional

        :param test_data_excel_sheet: name of the excel sheet to use for user-defined holdout/test data,
            only applicable when the test dataset file is an xlsx file, by default the first sheet is used
        :type test_data_excel_sheet: str, optional

        :param test_data_encoding: encoding type for the CSV user-defined holdout/test file
        :type test_data_encoding: str, optional

        :param confidence_level: when the pipeline "PointwiseBoundedHoltWinters" or "PointwiseBoundedBATS"
            is used, the prediction interval is calculated at a given confidence_level to decide
            if a data record is an anomaly or not, optional for timeseries anomaly prediction
        :type confidence_level: float, optional

        :return: RemoteAutoPipelines or LocalAutoPipelines, depending on how the AutoAI object was initialized
        :rtype: RemoteAutoPipelines or LocalAutoPipelines

        **Examples**

        .. code-block:: python

            from ibm_watson_machine_learning.experiment import AutoAI

            experiment = AutoAI(...)

            fairness_info = {
                "protected_attributes": [
                    {"feature": "Sex", "reference_group": ['male'], "monitored_group": ['female']},
                    {"feature": "Age", "reference_group": [[50, 60]], "monitored_group": [[18, 49]]}
                ],
                "favorable_labels": ["No Risk"],
                "unfavorable_labels": ["Risk"],
            }

            optimizer = experiment.optimizer(
                name="name of the optimizer.",
                prediction_type=AutoAI.PredictionType.BINARY,
                prediction_column="y",
                scoring=AutoAI.Metrics.ROC_AUC_SCORE,
                desc="Some description.",
                holdout_size=0.1,
                max_number_of_estimators=1,
                fairness_info=fairness_info,
                cognito_transform_names=[AutoAI.Transformers.SUM, AutoAI.Transformers.MAX],
                train_sample_rows_test_size=1,
                include_only_estimators=[AutoAI.ClassificationAlgorithms.LGBM, AutoAI.ClassificationAlgorithms.XGB],
                t_shirt_size=AutoAI.TShirtSize.L
            )

            optimizer = experiment.optimizer(
                name="name of the optimizer.",
                prediction_type=AutoAI.PredictionType.MULTICLASS,
                prediction_column="y",
                scoring=AutoAI.Metrics.ROC_AUC_SCORE,
                desc="Some description.",
            )
        """
        # note: convert `timeseries` type to PredictionType.FORECASTING:
        if prediction_type == 'timeseries':
            prediction_type = PredictionType.FORECASTING

        if prediction_type != PredictionType.FORECASTING and retrain_on_holdout is None:
            retrain_on_holdout = True

        # Deprecation of excel_sheet as number:
        if isinstance(excel_sheet, int):
            warn(message="Support for excel sheet as number of the sheet (int) is deprecated! "
                         "Please set excel sheet with the name of the sheet.")

        if data_join_graph is not None:
            if self._workspace.wml_client.ICP_46:
                raise WMLClientError(Messages.get_message(message_id="obm_removal_message_cpd"))
            elif self._workspace.wml_client.ICP:
                print(Messages.get_message(message_id="obm_deprecation_message_cpd"))
            else:
                raise WMLClientError(Messages.get_message(message_id="obm_removal_message_cloud"))

        if prediction_type == PredictionType.FORECASTING and self._workspace.wml_client.ICP and \
                (self._workspace.wml_client.wml_credentials['version'].startswith('2.5') or
                 self._workspace.wml_client.wml_credentials['version'].startswith('3.0') or
                 self._workspace.wml_client.wml_credentials['version'].startswith('3.5')):
            raise TSNotSupported()

        if prediction_type == PredictionType.TIMESERIES_ANOMALY_PREDICTION and self._workspace.wml_client.ICP and \
                self._workspace.wml_client.wml_credentials['version'].startswith(
                    ('2.5', '3.0', '3.5', '4.0', '4.5', '4.6')):
            raise TSADNotSupported()

        if prediction_type in (PredictionType.FORECASTING, 'timeseries'):
            if not numerical_imputation_strategy and type(numerical_imputation_strategy) is not list:
                numerical_imputation_strategy = ImputationStrategy.BEST_OF_DEFAULT_IMPUTERS
            elif not numerical_imputation_strategy and type(numerical_imputation_strategy) is list:
                numerical_imputation_strategy = ImputationStrategy.NO_IMPUTATION

            if prediction_column is not None or prediction_columns is None:
                raise ForecastPredictionColumnsMissing()
        elif prediction_type == PredictionType.TIMESERIES_ANOMALY_PREDICTION:
            if feature_columns is None or prediction_column is not None or prediction_columns is not None:
                raise TimeseriesAnomalyPredictionFeatureColumnsMissing()

            if scoring is not None and scoring not in (Metrics.F1_SCORE, Metrics.ACCURACY_SCORE,
                                                       Metrics.BALANCED_ACCURACY, Metrics.PRECISION_SCORE,
                                                       Metrics.RECALL_SCORE):
                raise TimeseriesAnomalyPredictionUnsupportedMetric(scoring)
        else:
            if prediction_column is None or prediction_columns is not None:
                raise NonForecastPredictionColumnMissing(prediction_type)

        if test_size:
            print('Note: Using `test_size` is deprecated. Use `holdout_size` instead.')
            if not holdout_size:
                holdout_size = test_size
            test_size = None

        if daub_include_only_estimators:
            print('Note: Using `daub_include_only_estimators` is deprecated. Use `include_only_estimators` instead.')
            if not include_only_estimators:
                include_only_estimators = daub_include_only_estimators
            daub_include_only_estimators = None

        if train_sample_rows_test_size and self._workspace.wml_client.ICP_46:
            print('Note: Using `train_sample_rows_test_size` is deprecated. '
                  'Use either `sample_rows_limit` or `sample_percentage_limit` instead.')
            if not sample_rows_limit and not sample_percentage_limit:
                if type(train_sample_rows_test_size) is float and train_sample_rows_test_size <= 1:
                    print('Value of `train_sample_rows_test_size` parameter '
                          'will be passed as `sample_percentage_limit`.')
                    sample_percentage_limit = train_sample_rows_test_size
                elif int(train_sample_rows_test_size) == train_sample_rows_test_size \
                        and train_sample_rows_test_size > 1:
                    print('Value of `train_sample_rows_test_size` parameter '
                          'will be passed as `sample_rows_limit`.')
                    sample_rows_limit = int(train_sample_rows_test_size)
                train_sample_rows_test_size = None
            elif sample_rows_limit or sample_percentage_limit:
                print('Parameter `train_sample_rows_test_size` will be ignored.')
                train_sample_rows_test_size = None

        def translate_str_imputation_param(x):
            if type(x) is list and prediction_type != PredictionType.FORECASTING:
                raise ImputationListNotSupported()

            if type(x) == str or (type(x) == list and type(x[0]) == str):
                return translate_imputation_string_strategy_to_enum(x, prediction_type)
            else:
                return x

        def translate_str_include_only_estimators_param(x):
            return [translate_estimator_string_to_enum(estimator) for estimator in x]

        def translate_str_include_batched_ensemble_estimators_param(x):
            return [translate_batched_estimator_string_to_enum(estimator) for estimator in x]

        def translate_str_pipeline_types_param(x):
            if prediction_type == PredictionType.TIMESERIES_ANOMALY_PREDICTION:
                return [TimeseriesAnomalyPredictionPipelineTypes(pipeline_type) for pipeline_type in x]
            else:
                return [ForecastingPipelineTypes(pipeline_type) for pipeline_type in x]

        categorical_imputation_strategy = translate_str_imputation_param(categorical_imputation_strategy)
        numerical_imputation_strategy = translate_str_imputation_param(numerical_imputation_strategy)
        include_only_estimators = translate_str_include_only_estimators_param(
            include_only_estimators) if include_only_estimators else None
        include_batched_ensemble_estimators = translate_str_include_batched_ensemble_estimators_param(
            include_batched_ensemble_estimators) if include_batched_ensemble_estimators else None
        pipeline_types = translate_str_pipeline_types_param(pipeline_types) if pipeline_types is not None else None

        if include_batched_ensemble_estimators:
            for batched_estimator in include_batched_ensemble_estimators:
                basic_estimator_str = batched_estimator.value.split("(")[1][:-1]
                basic_estimator = translate_estimator_string_to_enum(basic_estimator_str)
                if include_only_estimators is not None and basic_estimator not in include_only_estimators:
                    raise MissingEstimatorForExistingBatchedEstimator(batched_estimator, basic_estimator)

        validate_optimizer_enum_values(
            prediction_type=prediction_type,
            daub_include_only_estimators=daub_include_only_estimators,
            include_only_estimators=include_only_estimators,
            include_batched_ensemble_estimators=include_batched_ensemble_estimators,
            cognito_transform_names=cognito_transform_names,
            imputation_strategies=[x for y in list(filter(
                None, [categorical_imputation_strategy, numerical_imputation_strategy]))
                for x in (y if type(y) is list else [y])],
            scoring=scoring,
            t_shirt_size=kwargs.get("t_shirt_size", TShirtSize.M),
            is_cpd=self._workspace.wml_client.ICP
        )

        if daub_give_priority_to_runtime is not None:
            if daub_give_priority_to_runtime < 0.0 or daub_give_priority_to_runtime > 5.0:
                raise ParamOutOfRange('daub_give_priority_to_runtime', daub_give_priority_to_runtime, 0.0, 5.0)

        if data_join_graph:
            data_join_graph.problem_type = prediction_type
            data_join_graph.target_column = prediction_column

        if (prediction_type == PredictionType.BINARY and scoring in vars(PositiveLabelClass).values()
                and positive_label is None):
            raise MissingPositiveLabel(scoring, reason=f"\"{scoring}\" needs a \"positive_label\" "
                                                       f"parameter to be defined when used with binary "
                                                       f"classification.")

        if self._workspace is None and kwargs.get('t_shirt_size'):
            raise LocalInstanceButRemoteParameter(
                "t_shirt_size",
                reason="During LocalOptimizer initialization, \"t_shirt_size\" parameter was provided. "
                       "\"t_shirt_size\" parameter is only applicable to the RemoteOptimizer instance."
            )
        elif self._workspace is None:
            if prediction_type == PredictionType.FORECASTING:
                raise ForecastingCannotBeRunAsLocalScenario()
            if prediction_type == PredictionType.TIMESERIES_ANOMALY_PREDICTION:
                raise TimeseriesAnomalyPredictionCannotBeRunAsLocalScenario()

            reduced_kwargs = copy.copy(kwargs)
            for n in ['_force_local_scenario']:
                if n in reduced_kwargs:
                    del reduced_kwargs[n]

            validate_additional_params_for_optimizer(reduced_kwargs)

            return LocalAutoPipelines(
                name=name,
                prediction_type='classification' if prediction_type in ['binary', 'multiclass'] else prediction_type,
                prediction_column=prediction_column,
                scoring=scoring,
                desc=desc,
                holdout_size=holdout_size,
                max_num_daub_ensembles=max_number_of_estimators,
                train_sample_rows_test_size=train_sample_rows_test_size,
                include_only_estimators=include_only_estimators,
                include_batched_ensemble_estimators=include_batched_ensemble_estimators,
                cognito_transform_names=cognito_transform_names,
                positive_label=positive_label,
                _force_local_scenario=kwargs.get('_force_local_scenario', False),
                **reduced_kwargs
            )

        else:
            reduced_kwargs = copy.copy(kwargs)
            for n in ['t_shirt_size', 'notebooks', 'autoai_pod_version', 'obm_pod_version']:
                if n in reduced_kwargs:
                    del reduced_kwargs[n]

            validate_additional_params_for_optimizer(reduced_kwargs)

            engine = WMLEngine(self._workspace)
            if self._20_class_limit_removal_test:
                engine._20_class_limit_removal_test = True

            optimizer = RemoteAutoPipelines(
                name=name,
                prediction_type=prediction_type,
                prediction_column=prediction_column,
                prediction_columns=prediction_columns,
                timestamp_column_name=timestamp_column_name,
                scoring=scoring,
                desc=desc,
                holdout_size=holdout_size,
                max_num_daub_ensembles=max_number_of_estimators,
                t_shirt_size=self._workspace.restrict_pod_size(t_shirt_size=kwargs.get(
                    't_shirt_size', TShirtSize.M if self._workspace.wml_client.ICP else TShirtSize.L)
                ),
                train_sample_rows_test_size=train_sample_rows_test_size,
                include_only_estimators=include_only_estimators,
                include_batched_ensemble_estimators=include_batched_ensemble_estimators,
                backtest_num=backtest_num,
                lookback_window=lookback_window,
                forecast_window=forecast_window,
                backtest_gap_length=backtest_gap_length,
                cognito_transform_names=cognito_transform_names,
                data_join_graph=data_join_graph,
                drop_duplicates=drop_duplicates,
                outliers_columns=outliers_columns,
                text_processing=text_processing,
                word2vec_feature_number=word2vec_feature_number,
                csv_separator=csv_separator,
                excel_sheet=excel_sheet,
                encoding=encoding,
                positive_label=positive_label,
                data_join_only=data_join_only,
                engine=engine,
                daub_give_priority_to_runtime=daub_give_priority_to_runtime,
                notebooks=kwargs.get('notebooks', True),
                autoai_pod_version=kwargs.get('autoai_pod_version', None),
                obm_pod_version=kwargs.get('obm_pod_version', None),
                fairness_info=fairness_info,
                sampling_type=sampling_type,
                sample_size_limit=sample_size_limit,
                sample_rows_limit=sample_rows_limit,
                sample_percentage_limit=sample_percentage_limit,
                number_of_batch_rows=number_of_batch_rows,
                n_parallel_data_connections=n_parallel_data_connections,
                categorical_imputation_strategy=categorical_imputation_strategy,
                numerical_imputation_strategy=numerical_imputation_strategy,
                numerical_imputation_value=numerical_imputation_value,
                imputation_threshold=imputation_threshold,
                retrain_on_holdout=retrain_on_holdout,
                numerical_columns=numerical_columns,
                categorical_columns=categorical_columns,
                feature_columns=feature_columns,
                pipeline_types=pipeline_types,
                supporting_features_at_forecast=supporting_features_at_forecast,
                test_data_csv_separator=test_data_csv_separator,
                test_data_excel_sheet=test_data_excel_sheet,
                test_data_encoding=test_data_encoding,
                confidence_level=confidence_level,
                **reduced_kwargs
            )
            optimizer._workspace = self._workspace

            return optimizer
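
    # A hedged sketch of a forecasting configuration, complementing the binary and
    # multiclass examples in the docstring above; forecasting takes
    # prediction_columns (not prediction_column) and column names are placeholders:
    #
    #     optimizer = experiment.optimizer(
    #         name="forecasting optimizer",
    #         prediction_type=AutoAI.PredictionType.FORECASTING,
    #         prediction_columns=["value"],
    #         timestamp_column_name="timestamp",
    #         lookback_window=10,
    #         forecast_window=1)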

    def _init_estimator_enums(self):
        if self._workspace and self._workspace.wml_client.ICP:
            self.ClassificationAlgorithms = ClassificationAlgorithmsCP4D
            self.RegressionAlgorithms = RegressionAlgorithmsCP4D
            self.ForecastingAlgorithms = ForecastingAlgorithmsCP4D
        else:
            self.ClassificationAlgorithms = ClassificationAlgorithms
            self.RegressionAlgorithms = RegressionAlgorithms
            self.ForecastingAlgorithms = ForecastingAlgorithms

    def _block_autoai_on_git_based_project(self):
        """Raises ForbiddenActionForGitBasedProject error for AutoAI experiments on a git based project.
        It can be disabled by setting the environment variable ENABLE_AUTOAI to 'true'.
        """
        from os import environ

        if self._workspace:
            if getattr(self._workspace.wml_client, 'project_type', None) == 'local_git_storage' \
                    and environ.get('ENABLE_AUTOAI', 'false').lower() == 'false':
                raise ForbiddenActionForGitBasedProject(
                    reason="Creating AutoAI experiment is not supported for git based project.")
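
# A hedged, self-contained sketch (runs only when this module is executed directly):
# a timeseries anomaly prediction setup following the validation rules in
# optimizer() above - feature_columns is required, scoring is limited to
# F1/accuracy/balanced accuracy/precision/recall, and confidence_level drives the
# PointwiseBounded* pipelines. Credentials and column names are placeholders.
if __name__ == '__main__':
    experiment = AutoAI(
        wml_credentials={"apikey": "...", "url": "https://us-south.ml.cloud.ibm.com"},
        space_id="...")

    optimizer = experiment.optimizer(
        name="anomaly prediction optimizer",
        prediction_type=AutoAI.PredictionType.TIMESERIES_ANOMALY_PREDICTION,
        feature_columns=["sensor_1", "sensor_2"],
        scoring=AutoAI.Metrics.F1_SCORE,
        confidence_level=0.95)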