Scikit-learn Cheatsheet
Installation
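A typical setup (pip shown; conda works too):

pip install scikit-learn                                 # standard install
pip install -U scikit-learn                              # upgrade an existing install
conda install -c conda-forge scikit-learn                # conda alternative
python -c "import sklearn; print(sklearn.__version__)"   # verify the install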
Basic Commands - Data Loading & Splitting
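A minimal sketch of loading a bundled dataset and splitting it (names like X_train are reused in later snippets):

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)         # bundled toy dataset
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42  # 80/20 split, fixed seed
)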
Basic Commands - Preprocessing
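Common transformers, assuming the X_train/X_test split above:

from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.impute import SimpleImputer

scaler = StandardScaler()                          # zero mean, unit variance
X_train_scaled = scaler.fit_transform(X_train)     # fit on training data only
X_test_scaled = scaler.transform(X_test)           # reuse training statistics
X_minmax = MinMaxScaler().fit_transform(X_train)   # scale to [0, 1]
X_imputed = SimpleImputer(strategy='mean').fit_transform(X_train)  # fill NaNs
y_encoded = LabelEncoder().fit_transform(y_train)  # string labels -> integers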
Basic Commands - Classification
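A minimal classification sketch using the split above:

from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)             # learn from training data
y_pred = clf.predict(X_test)          # hard class labels
y_proba = clf.predict_proba(X_test)   # class probabilities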
Basic Commands - Regression
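Regressors follow the same fit/predict pattern, sketched here with Ridge on a bundled regression dataset:

from sklearn.datasets import load_diabetes
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split

X_r, y_r = load_diabetes(return_X_y=True)   # bundled regression dataset
Xr_train, Xr_test, yr_train, yr_test = train_test_split(X_r, y_r, random_state=42)
reg = Ridge(alpha=1.0)                      # L2-regularized linear regression
reg.fit(Xr_train, yr_train)
print(reg.score(Xr_test, yr_test))          # R^2 by default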
Basic Commands - Model Evaluation
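Typical metric calls, assuming y_test and y_pred from the snippets above:

from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, confusion_matrix,
                             mean_squared_error, r2_score)

accuracy_score(y_test, y_pred)
f1_score(y_test, y_pred, average='weighted')   # multi-class friendly
confusion_matrix(y_test, y_pred)
mean_squared_error(yr_test, reg.predict(Xr_test))   # regression
r2_score(yr_test, reg.predict(Xr_test))             # regression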
Basic Commands - Cross-Validation
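A minimal cross-validation sketch:

from sklearn.model_selection import cross_val_score, StratifiedKFold

scores = cross_val_score(clf, X, y, cv=5, scoring='accuracy')
print(scores.mean(), scores.std())

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)  # keep class ratios per fold
scores = cross_val_score(clf, X, y, cv=cv)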
Advanced Usage - Ensemble Methods
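A sketch of the common ensemble estimators:

from sklearn.ensemble import (RandomForestClassifier, GradientBoostingClassifier,
                              AdaBoostClassifier, VotingClassifier)

rf = RandomForestClassifier(n_estimators=100, random_state=42)  # bagging of trees
gb = GradientBoostingClassifier(n_estimators=100)               # sequential boosting
ada = AdaBoostClassifier(n_estimators=50)
voting = VotingClassifier(estimators=[('rf', rf), ('gb', gb)],
                          voting='soft')   # average predicted probabilities
voting.fit(X_train, y_train)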
Advanced Usage - Support Vector Machines
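A minimal SVM sketch; scaling the features first matters for the RBF kernel:

from sklearn.svm import SVC, LinearSVC

svm = SVC(kernel='rbf', C=1.0, gamma='scale')
svm.fit(X_train_scaled, y_train)
linear_svm = LinearSVC(C=1.0, max_iter=10000)  # faster on linear problems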
Advanced Usage - Clustering
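Typical clustering calls (unsupervised, so no y):

from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering

km = KMeans(n_clusters=3, n_init=10, random_state=42)
cluster_labels = km.fit_predict(X)
db = DBSCAN(eps=0.5, min_samples=5)          # density-based; infers cluster count
agg = AgglomerativeClustering(n_clusters=3)  # hierarchical merging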
Advanced Usage - Dimensionality Reduction
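A minimal dimensionality-reduction sketch:

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

pca = PCA(n_components=2)
X_2d = pca.fit_transform(X)
print(pca.explained_variance_ratio_)   # variance captured per component
X_tsne = TSNE(n_components=2, random_state=42).fit_transform(X)  # visualization only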
Advanced Usage - Pipeline Construction
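The shortest way to chain steps; make_pipeline names the steps automatically (a fuller example appears under Pipeline Configuration below):

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

pipe = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
pipe.fit(X_train, y_train)    # scaling is fit inside the pipeline
y_pred = pipe.predict(X_test)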
Advanced Usage - Hyperparameter Tuning
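A minimal grid search (details under Grid Search Configuration below):

from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

grid = GridSearchCV(SVC(), {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}, cv=5)
grid.fit(X_train, y_train)
print(grid.best_params_, grid.best_score_)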
Advanced Usage - Feature Selection
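Common feature-selection sketches (k and n_features_to_select are illustrative):

from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.linear_model import LogisticRegression

selector = SelectKBest(score_func=f_classif, k=2)   # keep best features by univariate score
X_selected = selector.fit_transform(X, y)
rfe = RFE(LogisticRegression(max_iter=1000), n_features_to_select=2)
X_rfe = rfe.fit_transform(X, y)                     # recursive feature elimination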
Advanced Usage - Model Persistence
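Saving and reloading a fitted estimator, typically with joblib (the filename is arbitrary):

import joblib

joblib.dump(clf, 'model.joblib')    # persist any fitted estimator
clf = joblib.load('model.joblib')   # reload later for prediction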
Configuration
Model Parameter Configuration
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

# Logistic Regression parameters
LogisticRegression(
    penalty='l2',            # Regularization type: 'l1', 'l2', 'elasticnet', or None
    C=1.0,                   # Inverse regularization strength (smaller = stronger)
    solver='lbfgs',          # Algorithm: 'lbfgs', 'liblinear', 'newton-cg', 'sag', 'saga'
    max_iter=100,            # Maximum iterations
    random_state=42,         # Random seed for reproducibility
    n_jobs=-1                # Use all CPU cores
)

# Random Forest parameters
RandomForestClassifier(
    n_estimators=100,        # Number of trees
    criterion='gini',        # Split quality: 'gini' or 'entropy'
    max_depth=None,          # Maximum tree depth (None = unlimited)
    min_samples_split=2,     # Minimum samples to split a node
    min_samples_leaf=1,      # Minimum samples in a leaf
    max_features='sqrt',     # Features per split: 'sqrt', 'log2', int, float
    bootstrap=True,          # Bootstrap sampling
    oob_score=False,         # Out-of-bag score estimation
    n_jobs=-1,               # Parallel jobs
    random_state=42,
    class_weight='balanced'  # Handle imbalanced classes
)

# Support Vector Machine parameters
SVC(
    C=1.0,                   # Regularization parameter
    kernel='rbf',            # Kernel: 'linear', 'poly', 'rbf', 'sigmoid'
    degree=3,                # Polynomial degree (for 'poly' kernel)
    gamma='scale',           # Kernel coefficient: 'scale', 'auto', or float
    coef0=0.0,               # Independent term in kernel
    probability=False,       # Enable probability estimates (slower)
    cache_size=200,          # Kernel cache size (MB)
    class_weight=None,       # Class weights
    max_iter=-1              # Iteration limit (-1 = no limit)
)

# Gradient Boosting parameters
GradientBoostingClassifier(
    loss='log_loss',             # Loss function
    learning_rate=0.1,           # Shrinks the contribution of each tree
    n_estimators=100,            # Number of boosting stages
    subsample=1.0,               # Fraction of samples for fitting
    criterion='friedman_mse',    # Split quality measure
    min_samples_split=2,
    min_samples_leaf=1,
    max_depth=3,                 # Maximum tree depth
    max_features=None,           # Features per split
    validation_fraction=0.1,     # Held-out fraction for early stopping
    n_iter_no_change=None,       # Early stopping rounds (None = disabled)
    random_state=42
)
Pipeline Configuration
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier

# Complete pipeline with preprocessing
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=0.95)),   # keep 95% of the variance
    ('classifier', RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        random_state=42
    ))
])

# Pipeline with column-specific preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['age', 'income', 'score']),
        ('cat', OneHotEncoder(drop='first'), ['category', 'region'])
    ],
    remainder='passthrough'  # Keep other columns unchanged
)
full_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier())
])
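Because the ColumnTransformer above selects columns by name, the full pipeline expects a pandas DataFrame with those columns; a toy illustration with made-up rows:

import pandas as pd

df = pd.DataFrame({   # illustrative data matching the column names above
    'age': [25, 40, 31, 58], 'income': [30000, 90000, 52000, 75000],
    'score': [0.4, 0.9, 0.6, 0.7],
    'category': ['a', 'b', 'a', 'b'], 'region': ['n', 's', 'n', 's'],
    'target': [0, 1, 0, 1]
})
full_pipeline.fit(df.drop(columns='target'), df['target'])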
Cross-Validation Configuration
from sklearn.model_selection import cross_validate

# Multi-metric cross-validation
cv_results = cross_validate(
    estimator=model,
    X=X,
    y=y,
    cv=5,                        # Number of folds
    scoring={
        'accuracy': 'accuracy',
        'precision': 'precision_weighted',
        'recall': 'recall_weighted',
        'f1': 'f1_weighted',
        'roc_auc': 'roc_auc_ovr'
    },
    return_train_score=True,     # Include training scores
    return_estimator=True,       # Return fitted estimators
    n_jobs=-1,                   # Parallel processing
    verbose=1                    # Progress messages
)
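The returned dictionary holds one array per fold under 'test_<scorer name>' (and 'train_<scorer name>' when return_train_score=True); for example:

# Summarize the scores collected above
print(cv_results['test_accuracy'].mean())
print(cv_results['test_f1'].std())
print(cv_results['train_roc_auc'].mean())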
Grid Search Configuration
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Comprehensive parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True, False]
}
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=param_grid,
    scoring='f1_weighted',       # Scoring metric
    cv=5,                        # Cross-validation folds
    n_jobs=-1,                   # Use all cores
    verbose=2,                   # Verbosity level
    refit=True,                  # Refit best model on full data
    return_train_score=True,     # Return training scores
    error_score='raise'          # Raise on fit errors instead of skipping
)
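Fitting and reading out the search results, assuming a train split as in the use cases below:

grid_search.fit(X_train, y_train)          # exhaustive search over param_grid
print(grid_search.best_params_)
print(grid_search.best_score_)             # mean CV f1_weighted of the best combination
best_model = grid_search.best_estimator_   # already refit on the full training data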
Common Use Cases
Use Case 1: Binary Classification with Imbalanced Data
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import numpy as np

# Generate imbalanced dataset
X, y = make_classification(n_samples=1000, n_features=20, n_informative=15,
                           n_redundant=5, weights=[0.9, 0.1], random_state=42)

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train model with class balancing
model = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',  # Handle imbalance
    random_state=42
)
model.fit(X_train_scaled, y_train)

# Evaluate
predictions = model.predict(X_test_scaled)
probabilities = model.predict_proba(X_test_scaled)[:, 1]
print(classification_report(y_test, predictions))
print(f"ROC-AUC Score: {roc_auc_score(y_test, probabilities):.3f}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, predictions))
Use Case 2: Multi-Class Text Classification Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Example text data
texts = [
    "Python is great for machine learning",
    "Java is used for enterprise applications",
    "JavaScript powers web development",
    # ... more texts
]
labels = ['tech', 'tech', 'tech']  # Categories

# Create text classification pipeline
text_clf = Pipeline([
    ('tfidf', TfidfVectorizer(
        max_features=5000,
        ngram_range=(1, 2),
        stop_words='english'