# For logspace definitions
import numpy as np
# Clustering definitions
from sklearn.cluster import (
AffinityPropagation,
Birch,
KMeans,
MiniBatchKMeans,
MeanShift,
)
# Classifier definitions
from sklearn.svm import LinearSVC, NuSVC, SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import (
LogisticRegression,
PassiveAggressiveClassifier,
RidgeClassifier,
SGDClassifier,
)
from sklearn.ensemble import (
ExtraTreesClassifier,
RandomForestClassifier,
AdaBoostClassifier,
BaggingClassifier,
)
# [docs]  (source-link marker left over from the rendered documentation page)
def supervised_classifier_defs(settings_flag):
    """Establish supervised classifier definitions.

    The definitions are non-specific to the embeddor and, therefore,
    non-specific to the natural language processing application.

    Parameters
    ----------
    settings_flag : str
        Either 'light', 'normal' or 'detailed'; a setting which
        determines the number of hyperparameter combinations
        tested during the grid search. For instance, a dataset
        of 50 thousand samples may run for hours on the 'normal'
        setting but for days on 'detailed'. 'light' uses the same
        grids as 'normal' but leaves out LinearSVC, SVC and
        MLPClassifier.

    Returns
    -------
    search_space : dict
        Hyperparameter grids for each classifier, keyed by classifier
        name. Every parameter name carries a ``clf__`` prefix.
    classifiers : dict
        Contains sklearn classifier instances, keyed by the same
        names as ``search_space``.

    Raises
    ------
    ValueError
        If ``settings_flag`` is not 'light', 'normal' or 'detailed'.
    """
    # Validate first: previously an unknown flag only failed at the final
    # ``return`` with an UnboundLocalError that hid the real cause.
    valid_flags = ("light", "normal", "detailed")
    if settings_flag not in valid_flags:
        raise ValueError(
            "settings_flag must be one of {}; got {!r}".format(
                valid_flags, settings_flag
            )
        )

    classifiers = {
        "LinearSVC": LinearSVC(),
        "SVC": SVC(),
        "DecisionTreeClassifier": DecisionTreeClassifier(),
        "MLPClassifier": MLPClassifier(),
        "LogisticRegression": LogisticRegression(),
        "PassiveAggressiveClassifier": PassiveAggressiveClassifier(),
        "RidgeClassifier": RidgeClassifier(),
        "SGDClassifier": SGDClassifier(),
        "ExtraTreesClassifier": ExtraTreesClassifier(),
        "RandomForestClassifier": RandomForestClassifier(),
        "BaggingClassifier": BaggingClassifier(),
        "AdaBoostClassifier": AdaBoostClassifier(),
    }

    if settings_flag == "detailed":
        search_space = {
            "LinearSVC": {
                "clf__C": [1e-2, 1e-1, 1, 1e1, 1e2, 1e3],
                "clf__max_iter": [800, 1000, 1200, 1500, 2000],
            },
            "SVC": {
                # 1.0 was listed twice (as 1.0 and 1); each duplicate
                # candidate is refit for every fold of the grid search.
                "clf__C": [1.0, 1e-2, 1e-1, 1e1],
                "clf__gamma": [0.5, 0.1, 0.01, 0.001, 0.0001],
                "clf__kernel": ["rbf", "linear", "sigmoid", "poly"],
            },
            "DecisionTreeClassifier": {
                "clf__criterion": ["gini", "entropy"],
                "clf__splitter": ["best", "random"],
                "clf__min_samples_split": [2, 3, 4],
                "clf__min_samples_leaf": [1, 2, 3],
            },
            "MLPClassifier": {
                "clf__hidden_layer_sizes": [(100,), (100, 64), (100, 64, 16)],
                # "adam" was listed twice; duplicate dropped.
                "clf__solver": ["adam", "lbfgs", "sgd"],
                "clf__alpha": [1e-2, 1e-3],
                "clf__batch_size": ["auto"],
                "clf__learning_rate": ["adaptive", "invscaling", "constant"],
                "clf__max_iter": [1000],
            },
            "LogisticRegression": {
                "clf__solver": ["newton-cg", "lbfgs", "sag"],
                "clf__C": np.logspace(0, 4, 10),
            },
            "PassiveAggressiveClassifier": {
                "clf__C": [0.001, 0.01, 0.1, 1.0],
                "clf__loss": ["hinge", "squared_hinge"],
            },
            "RidgeClassifier": {
                # Repeated 1e-3 and 1.0 removed; the order of first
                # occurrence is kept so tie-breaking is unchanged.
                # ``normalize`` is not searched: it was removed from
                # RidgeClassifier in scikit-learn 1.2.
                "clf__alpha": [0.0, 1e-3, 1.0, 1e-4, 1e-2, 1e-1],
            },
            "SGDClassifier": {
                # "log" was renamed to "log_loss" in scikit-learn 1.1 and
                # the old spelling was removed in 1.3.
                "clf__loss": ["squared_hinge", "hinge", "log_loss"],
                "clf__alpha": [1e-3, 1e-2],
            },
            "ExtraTreesClassifier": {
                "clf__n_estimators": [200, 500],
                "clf__criterion": ["gini", "entropy"],
                "clf__min_samples_split": [2, 3, 4],
                "clf__min_samples_leaf": [1, 2, 3],
            },
            "RandomForestClassifier": {
                "clf__n_estimators": [200, 500],
                "clf__criterion": ["gini", "entropy"],
                "clf__min_samples_split": [2, 3, 4],
                "clf__min_samples_leaf": [1, 2, 3],
            },
            "BaggingClassifier": {
                "clf__n_estimators": [10, 30, 50, 100, 200],
                "clf__max_samples": [1.0, 0.8, 0.4, 0.2],
            },
            "AdaBoostClassifier": {
                "clf__n_estimators": [30, 50, 100, 150, 300],
                "clf__learning_rate": [1.0, 0.9, 0.8, 0.4],
                "clf__algorithm": ["SAMME"],
            },
        }
    else:
        # Shared by 'normal' and 'light'; 'light' is trimmed below.
        search_space = {
            "LinearSVC": {
                "clf__C": [1e-2, 1e-1],
                "clf__max_iter": [800, 1000],
            },
            "SVC": {
                "clf__C": [1.0],
                "clf__gamma": [0.5, 0.1, 0.01],
                "clf__kernel": ["rbf"],
            },
            "DecisionTreeClassifier": {
                "clf__criterion": ["gini"],
                "clf__splitter": ["best"],
                "clf__min_samples_split": [2],
                "clf__min_samples_leaf": [1],
            },
            "MLPClassifier": {
                "clf__hidden_layer_sizes": [(100,)],
                "clf__solver": ["adam"],
                "clf__alpha": [1e-2],
                "clf__batch_size": ["auto"],
                "clf__learning_rate": ["adaptive"],
                "clf__max_iter": [1000],
            },
            "LogisticRegression": {
                "clf__solver": ["newton-cg", "lbfgs", "sag"],
                "clf__C": np.logspace(0, 4, 10),
            },
            "PassiveAggressiveClassifier": {
                "clf__C": [0.001, 0.01, 0.1, 1.0],
                "clf__loss": ["hinge", "squared_hinge"],
            },
            "RidgeClassifier": {
                "clf__alpha": [0.0, 1e-3, 1.0],
            },
            "SGDClassifier": {
                "clf__loss": ["squared_hinge"],
                "clf__alpha": [1e-3, 1e-2],
            },
            "ExtraTreesClassifier": {
                "clf__n_estimators": [200, 500],
                "clf__criterion": ["gini"],
                "clf__min_samples_split": [2],
                "clf__min_samples_leaf": [1],
            },
            "RandomForestClassifier": {
                "clf__n_estimators": [200, 500],
                "clf__criterion": ["gini"],
                "clf__min_samples_split": [2],
                "clf__min_samples_leaf": [1],
            },
            "BaggingClassifier": {
                "clf__n_estimators": [30, 50, 100],
                "clf__max_samples": [1.0, 0.8],
            },
            "AdaBoostClassifier": {
                "clf__n_estimators": [50, 100],
                "clf__learning_rate": [1.0, 0.9, 0.8],
                "clf__algorithm": ["SAMME"],
            },
        }

    if settings_flag == "light":
        # 'light' is the 'normal' configuration without these three
        # estimators; filtering keeps the two settings from drifting apart.
        skipped = ("LinearSVC", "SVC", "MLPClassifier")
        classifiers = {
            name: clf for name, clf in classifiers.items() if name not in skipped
        }
        search_space = {
            name: grid for name, grid in search_space.items() if name not in skipped
        }

    return search_space, classifiers
# [docs]  (source-link marker left over from the rendered documentation page)
def unsupervised_classifier_defs(setting_flag, n_clusters):
    """Establish unsupervised clusterer definitions.

    The definitions are non-specific to the embeddor and, therefore,
    non-specific to the natural language processing application.

    Parameters
    ----------
    setting_flag : str
        Either 'normal' or 'detailed'; a setting which determines
        the number of hyperparameter combinations tested during
        the grid search. For instance, a dataset of 50,000 samples
        may run for hours on the 'normal' setting but for days
        on 'detailed'.
    n_clusters : int
        Number of clusters to organize the text data into. Usually
        set to the number of unique categories within data. It is
        placed in the Birch, KMeans and MiniBatchKMeans grids only.

    Returns
    -------
    search_space : dict
        Hyperparameter grids for each clusterer, keyed by clusterer
        name. Every parameter name carries a ``clf__`` prefix.
    clusterers : dict
        Contains sklearn cluster instances, keyed by the same names
        as ``search_space``.

    Raises
    ------
    ValueError
        If ``setting_flag`` is neither 'normal' nor 'detailed'.
    """
    # Validate first: previously an unknown flag (including 'light', which
    # has no clustering grid) only failed at ``return`` with an
    # UnboundLocalError.
    valid_flags = ("normal", "detailed")
    if setting_flag not in valid_flags:
        raise ValueError(
            "setting_flag must be one of {}; got {!r}".format(
                valid_flags, setting_flag
            )
        )

    clusterers = {
        "AffinityPropagation": AffinityPropagation(),
        "Birch": Birch(),
        "KMeans": KMeans(),
        "MiniBatchKMeans": MiniBatchKMeans(),
        "MeanShift": MeanShift(),
    }

    if setting_flag == "normal":
        search_space = {
            "AffinityPropagation": {
                "clf__damping": [0.5, 0.9],
                "clf__max_iter": [200, 600],
            },
            "Birch": {
                "clf__threshold": [0.5, 0.75, 1.0],
                "clf__n_clusters": [n_clusters],
                "clf__branching_factor": [50, 100],
            },
            "KMeans": {
                "clf__n_clusters": [n_clusters],
                "clf__init": ["k-means++", "random"],
                "clf__n_init": [10, 50, 100],
            },
            "MiniBatchKMeans": {
                "clf__n_clusters": [n_clusters],
                "clf__init": ["k-means++", "random"],
                "clf__n_init": [3, 10, 20],
            },
            "MeanShift": {
                "clf__bandwidth": [None],
                "clf__bin_seeding": [False, True],
                "clf__max_iter": [300, 600],
            },
        }
    else:  # "detailed"
        search_space = {
            "AffinityPropagation": {
                "clf__damping": [0.5, 0.75, 0.9],
                "clf__max_iter": [200, 600, 800, 1000, 1200],
            },
            "Birch": {
                "clf__threshold": [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
                "clf__n_clusters": [n_clusters],
                "clf__branching_factor": [25, 50, 100, 200],
            },
            "KMeans": {
                "clf__n_clusters": [n_clusters],
                "clf__init": ["k-means++", "random"],
                "clf__n_init": [10, 50, 100],
                "clf__max_iter": [300, 600],
            },
            "MiniBatchKMeans": {
                "clf__n_clusters": [n_clusters],
                "clf__init": ["k-means++", "random"],
                "clf__n_init": [3, 10, 20],
                "clf__max_iter": [100, 300],
            },
            "MeanShift": {
                "clf__bandwidth": [None],
                "clf__bin_seeding": [False, True],
                "clf__max_iter": [300, 600, 1000],
            },
        }

    return search_space, clusterers