Scikit-learn Cheatsheet
Installation
| Platform | Command |
|---|---|
| pip (all platforms) | pip install scikit-learn |
| conda | conda install scikit-learn |
| Ubuntu/Debian | sudo apt-get install python3-sklearn |
| macOS (Homebrew) | brew install python && pip3 install scikit-learn |
| Specific version | pip install scikit-learn==1.3.0 |
| With dependencies | pip install scikit-learn numpy scipy matplotlib pandas |
| Virtual environment | python -m venv env && source env/bin/activate && pip install scikit-learn |
| Verify installation | python -c "import sklearn; print(sklearn.__version__)" |
Basic Commands - Data Loading & Splitting
| Command | Description |
|---|---|
| from sklearn import datasets | Import datasets module |
| iris = datasets.load_iris() | Load iris dataset |
| X, y = iris.data, iris.target | Extract features and labels |
| datasets.load_digits() | Load handwritten digits dataset |
| datasets.load_wine() | Load wine classification dataset |
| datasets.load_breast_cancer() | Load breast cancer dataset |
| from sklearn.model_selection import train_test_split | Import train/test split function |
| X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) | Split data 80/20 train/test |
| train_test_split(X, y, test_size=0.3, random_state=42) | Split with fixed random seed |
| train_test_split(X, y, stratify=y) | Stratified split (preserves class distribution) |
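A minimal example combining the calls above (iris data, stratified 80/20 split):
from sklearn import datasets
from sklearn.model_selection import train_test_split
iris = datasets.load_iris()
X, y = iris.data, iris.target                     # features (150 x 4) and labels (3 classes)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
print(X_train.shape, X_test.shape)                # (120, 4) (30, 4)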
Basic Commands - Preprocessing
| Command | Description |
|---|---|
| from sklearn.preprocessing import StandardScaler | Import standardization scaler |
| scaler = StandardScaler() | Create scaler instance |
| scaler.fit(X_train) | Compute mean and std from training data |
| X_scaled = scaler.transform(X_train) | Apply scaling transformation |
| X_scaled = scaler.fit_transform(X_train) | Fit and transform in one step |
| from sklearn.preprocessing import MinMaxScaler | Import min-max normalizer |
| normalizer = MinMaxScaler(feature_range=(0, 1)) | Create normalizer for 0-1 range |
| from sklearn.preprocessing import LabelEncoder | Import label encoder |
| encoder = LabelEncoder() | Create encoder instance |
| y_encoded = encoder.fit_transform(y) | Encode categorical labels to integers |
| encoder.inverse_transform(y_encoded) | Decode integers back to original labels |
| from sklearn.preprocessing import OneHotEncoder | Import one-hot encoder |
| OneHotEncoder(sparse_output=False).fit_transform(X) | Create dummy variables from categories (the parameter is sparse in scikit-learn < 1.2) |
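A short sketch of the fit-on-train, transform-on-test pattern, assuming X_train and X_test from the split above; the LabelEncoder lines are standalone:
from sklearn.preprocessing import StandardScaler, LabelEncoder
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)    # learn mean/std from training data only
X_test_scaled = scaler.transform(X_test)          # reuse the same statistics on the test data
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(['cat', 'dog', 'dog', 'cat'])   # -> [0, 1, 1, 0]
print(encoder.inverse_transform(y_encoded))       # -> ['cat' 'dog' 'dog' 'cat']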
Basic Commands - Classification
| Command | Description |
|---|---|
| from sklearn.linear_model import LogisticRegression | Import logistic regression |
| model = LogisticRegression() | Create logistic regression model |
| model.fit(X_train, y_train) | Train model on training data |
| predictions = model.predict(X_test) | Make predictions on test data |
| probabilities = model.predict_proba(X_test) | Get prediction probabilities |
| from sklearn.tree import DecisionTreeClassifier | Import decision tree classifier |
| tree = DecisionTreeClassifier(max_depth=5) | Create decision tree with max depth |
| from sklearn.neighbors import KNeighborsClassifier | Import K-nearest neighbors |
| knn = KNeighborsClassifier(n_neighbors=5) | Create KNN with 5 neighbors |
| from sklearn.naive_bayes import GaussianNB | Import Gaussian Naive Bayes |
| nb = GaussianNB() | Create Naive Bayes classifier |
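A minimal classifier fit, assuming the scaled iris split from the preprocessing example above:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(max_iter=1000)         # generous max_iter avoids convergence warnings
model.fit(X_train_scaled, y_train)
predictions = model.predict(X_test_scaled)        # hard class labels
probabilities = model.predict_proba(X_test_scaled)   # one probability column per class
print(predictions[:5])
print(probabilities[:1])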
Basic Commands - Regression
| Command | Description |
|---|---|
| from sklearn.linear_model import LinearRegression | Import linear regression |
| model = LinearRegression() | Create linear regression model |
| model.fit(X_train, y_train) | Train regression model |
| predictions = model.predict(X_test) | Predict continuous values |
| model.coef_ | Access model coefficients |
| model.intercept_ | Access model intercept |
| from sklearn.linear_model import Ridge | Import Ridge regression (L2) |
| ridge = Ridge(alpha=1.0) | Create Ridge with regularization strength |
| from sklearn.linear_model import Lasso | Import Lasso regression (L1) |
| lasso = Lasso(alpha=0.1) | Create Lasso with regularization |
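A quick regression sketch; it uses a synthetic dataset so it runs on its own:
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import train_test_split
X_r, y_r = make_regression(n_samples=200, n_features=5, noise=10, random_state=42)
X_tr, X_te, y_tr, y_te = train_test_split(X_r, y_r, test_size=0.2, random_state=42)
reg = LinearRegression().fit(X_tr, y_tr)
print(reg.coef_, reg.intercept_)                  # fitted coefficients and intercept
ridge = Ridge(alpha=1.0).fit(X_tr, y_tr)          # L2-regularized alternative
print(ridge.score(X_te, y_te))                    # R² on held-out data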
Basic Commands - Model Evaluation
| Command | Description |
|---|---|
| from sklearn.metrics import accuracy_score | Import accuracy metric |
| accuracy_score(y_test, predictions) | Calculate accuracy (fraction of correct predictions) |
| from sklearn.metrics import classification_report | Import detailed classification metrics |
| classification_report(y_test, predictions) | Get precision, recall, F1-score |
| from sklearn.metrics import confusion_matrix | Import confusion matrix |
| confusion_matrix(y_test, predictions) | Create confusion matrix |
| from sklearn.metrics import mean_squared_error | Import MSE metric |
| mean_squared_error(y_test, predictions) | Calculate mean squared error |
| mean_squared_error(y_test, predictions, squared=False) | Calculate RMSE (scikit-learn ≥ 1.4 also provides root_mean_squared_error) |
| from sklearn.metrics import r2_score | Import R² metric |
| r2_score(y_test, predictions) | Calculate R² score |
| model.score(X_test, y_test) | Get default score (accuracy for classifiers, R² for regressors) |
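Putting the classification metrics together, assuming model, X_test_scaled, and y_test from the sections above:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
predictions = model.predict(X_test_scaled)
print(accuracy_score(y_test, predictions))        # fraction of correct predictions
print(confusion_matrix(y_test, predictions))      # rows = true classes, columns = predicted classes
print(classification_report(y_test, predictions)) # per-class precision, recall, F1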
Basic Commands - Cross-Validation
| Command | Description |
|---|---|
| from sklearn.model_selection import cross_val_score | Import cross-validation |
| cross_val_score(model, X, y, cv=5) | Perform 5-fold cross-validation |
| scores = cross_val_score(model, X, y, cv=10) | 10-fold cross-validation |
| scores.mean() | Get mean cross-validation score |
| scores.std() | Get standard deviation of scores |
| from sklearn.model_selection import cross_validate | Import multi-metric CV |
| cross_validate(model, X, y, cv=5, scoring=['accuracy', 'precision']) | CV with multiple metrics |
| from sklearn.model_selection import KFold | Import K-fold splitter |
| kfold = KFold(n_splits=5, shuffle=True, random_state=42) | Create K-fold object |
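A brief cross-validation example on the full iris data (X, y) with a shuffled K-fold split:
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LogisticRegression
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(LogisticRegression(max_iter=1000), X, y, cv=kfold)
print(scores)                                     # one accuracy value per fold
print(f"{scores.mean():.3f} +/- {scores.std():.3f}")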
Advanced Usage - Ensemble Methods
| Command | Description |
|---|---|
| from sklearn.ensemble import RandomForestClassifier | Import random forest |
| rf = RandomForestClassifier(n_estimators=100, max_depth=10, n_jobs=-1) | Create random forest with 100 trees |
| rf.feature_importances_ | Get feature importance scores |
| from sklearn.ensemble import GradientBoostingClassifier | Import gradient boosting |
| gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1) | Create gradient boosting model |
| from sklearn.ensemble import AdaBoostClassifier | Import AdaBoost |
| ada = AdaBoostClassifier(n_estimators=50, learning_rate=1.0) | Create AdaBoost classifier |
| from sklearn.ensemble import VotingClassifier | Import voting ensemble |
| VotingClassifier(estimators=[('lr', model1), ('rf', model2)], voting='soft') | Combine multiple models |
| from sklearn.ensemble import BaggingClassifier | Import bagging ensemble |
| BaggingClassifier(estimator=tree, n_estimators=10) | Create bagging ensemble (the parameter is base_estimator in scikit-learn < 1.2) |
| from sklearn.ensemble import StackingClassifier | Import stacking ensemble |
| StackingClassifier(estimators=[...], final_estimator=LogisticRegression()) | Stack models with meta-learner |
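A random forest sketch on the iris split from above; note that feature_importances_ is only available after fitting:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, max_depth=10, n_jobs=-1, random_state=42)
rf.fit(X_train, y_train)                          # tree ensembles do not require feature scaling
print(rf.score(X_test, y_test))                   # accuracy on held-out data
print(rf.feature_importances_)                    # one importance per feature, sums to 1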
Advanced Usage - Support Vector Machines
| Command | Description |
|---|---|
| from sklearn.svm import SVC | Import SVM classifier |
| svm = SVC(kernel='rbf', C=1.0, gamma='scale') | Create RBF kernel SVM |
| SVC(kernel='linear') | Linear kernel SVM |
| SVC(kernel='poly', degree=3) | Polynomial kernel SVM |
| SVC(probability=True) | Enable probability estimates |
| from sklearn.svm import SVR | Import SVM regressor |
| svr = SVR(kernel='rbf', C=1.0, epsilon=0.1) | Create SVM regressor |
| from sklearn.svm import LinearSVC | Import linear SVM (faster) |
| LinearSVC(C=1.0, max_iter=1000) | Linear SVM for large datasets |
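SVMs are sensitive to feature scale, so a common pattern is to wrap them in a pipeline with a scaler (again assuming the iris split from above):
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
svm = make_pipeline(StandardScaler(), SVC(kernel='rbf', C=1.0, gamma='scale'))
svm.fit(X_train, y_train)
print(svm.score(X_test, y_test))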
Advanced Usage - Clustering
| Command | Description |
|---|---|
| from sklearn.cluster import KMeans | Import K-means clustering |
| kmeans = KMeans(n_clusters=3, random_state=42) | Create K-means with 3 clusters |
| clusters = kmeans.fit_predict(X) | Fit and get cluster labels |
| kmeans.cluster_centers_ | Get cluster centroids |
| kmeans.inertia_ | Get within-cluster sum of squares |
| from sklearn.cluster import DBSCAN | Import DBSCAN clustering |
| DBSCAN(eps=0.5, min_samples=5).fit_predict(X) | Density-based clustering |
| from sklearn.cluster import AgglomerativeClustering | Import hierarchical clustering |
| AgglomerativeClustering(n_clusters=3, linkage='ward') | Hierarchical clustering |
| from sklearn.cluster import MeanShift | Import mean shift clustering |
| MeanShift(bandwidth=2.0).fit_predict(X) | Mean shift clustering |
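A K-means example on the unlabeled iris features; n_init is set explicitly here because its default changed in recent scikit-learn releases:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
clusters = kmeans.fit_predict(X)                  # one cluster label per sample
print(kmeans.cluster_centers_.shape)              # (3, n_features)
print(kmeans.inertia_)                            # within-cluster sum of squares (lower = tighter clusters)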
Advanced Usage - Dimensionality Reduction
| Command | Description |
|---|---|
| from sklearn.decomposition import PCA | Import PCA |
| pca = PCA(n_components=2) | Create PCA with 2 components |
| X_pca = pca.fit_transform(X) | Transform to principal components |
| pca.explained_variance_ratio_ | Get variance explained by each component |
| pca.components_ | Get principal component vectors |
| PCA(n_components=0.95) | Keep components explaining 95% variance |
| from sklearn.manifold import TSNE | Import t-SNE |
| TSNE(n_components=2, perplexity=30).fit_transform(X) | t-SNE dimensionality reduction |
| from sklearn.decomposition import TruncatedSVD | Import truncated SVD |
| TruncatedSVD(n_components=100).fit_transform(X) | SVD for sparse matrices |
| from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA | Import LDA |
| LDA(n_components=2).fit_transform(X, y) | Supervised dimensionality reduction |
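A PCA sketch on the iris features; standardizing first is usually recommended so every feature contributes on the same scale:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
X_std = StandardScaler().fit_transform(X)
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_std)                  # shape (n_samples, 2)
print(pca.explained_variance_ratio_)              # variance explained by each component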
Advanced Usage - Pipeline Construction
| Command | Description |
|---|---|
| from sklearn.pipeline import Pipeline | Import pipeline |
| Pipeline([('scaler', StandardScaler()), ('clf', LogisticRegression())]) | Create processing pipeline |
| pipeline.fit(X_train, y_train) | Fit entire pipeline |
| pipeline.predict(X_test) | Predict using pipeline |
| from sklearn.pipeline import make_pipeline | Import pipeline maker |
| make_pipeline(StandardScaler(), PCA(10), LogisticRegression()) | Auto-name pipeline steps |
| pipeline.named_steps['scaler'] | Access specific pipeline step |
| from sklearn.compose import ColumnTransformer | Import column transformer |
| ColumnTransformer([('num', StandardScaler(), [0,1]), ('cat', OneHotEncoder(), [2])]) | Different preprocessing per column |
| pipeline.get_params() | Get all pipeline parameters |
| pipeline.set_params(clf__C=0.1) | Set nested pipeline parameters |
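The step__parameter naming used by set_params is the same double-underscore convention GridSearchCV expects; a small sketch on the iris split from above:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
pipeline = Pipeline([('scaler', StandardScaler()), ('clf', LogisticRegression())])
pipeline.set_params(clf__C=0.1)                   # <step name>__<parameter name>
pipeline.fit(X_train, y_train)
print(pipeline.named_steps['scaler'].mean_)       # inspect a fitted step
print(pipeline.score(X_test, y_test))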
Advanced Usage - Hyperparameter Tuning
| Command | Description |
|---|---|
| from sklearn.model_selection import GridSearchCV | Import grid search |
| GridSearchCV(model, param_grid, cv=5, n_jobs=-1) | Exhaustive parameter search |
| grid_search.fit(X_train, y_train) | Run grid search |
| grid_search.best_params_ | Get best parameters found |
| grid_search.best_score_ | Get best cross-validation score |
| grid_search.best_estimator_ | Get best model |
| grid_search.cv_results_ | Get detailed CV results |
| from sklearn.model_selection import RandomizedSearchCV | Import randomized search |
| RandomizedSearchCV(model, param_distributions, n_iter=100, cv=5) | Random parameter sampling |
| from scipy.stats import randint, uniform | Import distributions for random search |
| param_distributions = {'n_estimators': randint(50, 200)} | Define parameter distribution |
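A compact grid search over a random forest on the iris split from above; the grid is kept deliberately small so the search stays fast:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
param_grid = {'n_estimators': [50, 100], 'max_depth': [5, None]}
grid_search = GridSearchCV(RandomForestClassifier(random_state=42),
                           param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)                   # best combination found
print(grid_search.best_score_)                    # its mean cross-validation score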
Advanced Usage - Feature Selection
| Command | Description |
|---|---|
| from sklearn.feature_selection import SelectKBest | Import K-best selector |
| SelectKBest(k=10).fit_transform(X, y) | Select top 10 features |
| from sklearn.feature_selection import chi2, f_classif | Import scoring functions |
| SelectKBest(score_func=chi2, k=5) | Chi-squared feature selection |
| from sklearn.feature_selection import RFE | Import recursive feature elimination |
| RFE(estimator=model, n_features_to_select=10).fit(X, y) | Recursive feature elimination |
| from sklearn.feature_selection import SelectFromModel | Import model-based selection |
| SelectFromModel(RandomForestClassifier()).fit(X, y) | Select features by importance |
| selector.get_support() | Get boolean mask of selected features |
| selector.transform(X) | Transform to selected features only |
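A SelectKBest sketch on the iris data that keeps the selector object around so get_support() can be inspected:
from sklearn.feature_selection import SelectKBest, f_classif
selector = SelectKBest(score_func=f_classif, k=2)
X_selected = selector.fit_transform(X, y)         # keep the 2 highest-scoring features
print(selector.get_support())                     # boolean mask over the original columns
print(X_selected.shape)                           # (n_samples, 2)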
Advanced Usage - Model Persistence
| Command | Description |
|---|---|
| import joblib | Import joblib for model saving |
| joblib.dump(model, 'model.pkl') | Save model to file |
| model = joblib.load('model.pkl') | Load model from file |
| import pickle | Import pickle module |
| pickle.dump(model, open('model.pkl', 'wb')) | Save with pickle |
| model = pickle.load(open('model.pkl', 'rb')) | Load with pickle |
| joblib.dump(model, 'model.pkl', compress=3) | Save with compression |
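Saving a fitted pipeline rather than a bare model keeps preprocessing and estimator together; joblib is generally preferred over plain pickle for objects holding large NumPy arrays. A sketch assuming the fitted pipeline from the pipeline example above:
import joblib
joblib.dump(pipeline, 'model.pkl', compress=3)    # persist the fitted pipeline
restored = joblib.load('model.pkl')
print(restored.predict(X_test[:5]))               # the reloaded object predicts directly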
Configuration
Model Parameters Configuration
# Logistic Regression parameters
LogisticRegression(
penalty='l2', # Regularization type: 'l1', 'l2', 'elasticnet', or None (use None instead of 'none' in scikit-learn ≥ 1.2)
C=1.0, # Inverse regularization strength (smaller = stronger)
solver='lbfgs', # Algorithm: 'lbfgs', 'liblinear', 'newton-cg', 'sag', 'saga'
max_iter=100, # Maximum iterations
random_state=42, # Random seed for reproducibility
n_jobs=-1 # Use all CPU cores
)
# Random Forest parameters
RandomForestClassifier(
n_estimators=100, # Number of trees
criterion='gini', # Split quality: 'gini' or 'entropy'
max_depth=None, # Maximum tree depth (None = unlimited)
min_samples_split=2, # Minimum samples to split node
min_samples_leaf=1, # Minimum samples in leaf
max_features='sqrt', # Features per split: 'sqrt', 'log2', int, float
bootstrap=True, # Bootstrap sampling
oob_score=False, # Out-of-bag score estimation
n_jobs=-1, # Parallel jobs
random_state=42,
class_weight='balanced' # Handle imbalanced classes
)
# Support Vector Machine parameters
SVC(
C=1.0, # Regularization parameter
kernel='rbf', # Kernel: 'linear', 'poly', 'rbf', 'sigmoid'
degree=3, # Polynomial degree (for 'poly' kernel)
gamma='scale', # Kernel coefficient: 'scale', 'auto', or float
coef0=0.0, # Independent term in kernel
probability=False, # Enable probability estimates (slower)
cache_size=200, # Kernel cache size (MB)
class_weight=None, # Class weights
max_iter=-1 # Iteration limit (-1 = no limit)
)
# Gradient Boosting parameters
GradientBoostingClassifier(
loss='log_loss', # Loss function
learning_rate=0.1, # Shrinks contribution of each tree
n_estimators=100, # Number of boosting stages
subsample=1.0, # Fraction of samples for fitting
criterion='friedman_mse', # Split quality measure
min_samples_split=2,
min_samples_leaf=1,
max_depth=3, # Maximum tree depth
max_features=None, # Features per split
validation_fraction=0.1, # Fraction for early stopping
n_iter_no_change=None, # Early stopping rounds
random_state=42
)
Pipeline Configuration
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
# Complete pipeline with preprocessing
pipeline = Pipeline([
('scaler', StandardScaler()),
('pca', PCA(n_components=0.95)),
('classifier', RandomForestClassifier(
n_estimators=100,
max_depth=10,
random_state=42
))
])
# Pipeline with column-specific preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
preprocessor = ColumnTransformer(
transformers=[
('num', StandardScaler(), ['age', 'income', 'score']),
('cat', OneHotEncoder(drop='first'), ['category', 'region'])
],
remainder='passthrough' # Keep other columns unchanged
)
full_pipeline = Pipeline([
('preprocessor', preprocessor),
('classifier', RandomForestClassifier())
])
Cross-Validation Configuration
from sklearn.model_selection import cross_validate
# Multi-metric cross-validation
cv_results = cross_validate(
estimator=model,
X=X,
y=y,
cv=5, # Number of folds
scoring={
'accuracy': 'accuracy',
'precision': 'precision_weighted',
'recall': 'recall_weighted',
'f1': 'f1_weighted',
'roc_auc': 'roc_auc_ovr'
},
return_train_score=True, # Include training scores
return_estimator=True, # Return fitted estimators
n_jobs=-1, # Parallel processing
verbose=1 # Progress messages
)
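The returned dictionary uses test_/train_ prefixed keys, one entry per metric, for example:
print(cv_results['test_accuracy'].mean())         # mean validation accuracy across the folds
print(cv_results['train_f1'].mean())              # mean training F1 (available because return_train_score=True)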
Grid Search Configuration
from sklearn.model_selection import GridSearchCV
# Comprehensive parameter grid
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15, None],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
'max_features': ['sqrt', 'log2'],
'bootstrap': [True, False]
}
grid_search = GridSearchCV(
estimator=RandomForestClassifier(),
param_grid=param_grid,
scoring='f1_weighted', # Scoring metric
cv=5, # Cross-validation folds
n_jobs=-1, # Use all cores
verbose=2, # Verbosity level
refit=True, # Refit best model on full data
return_train_score=True, # Return training scores
error_score='raise' # How to handle errors
)
Common Use Cases
Use Case 1: Binary Classification with Imbalanced Data
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import numpy as np
# Generate imbalanced dataset
X, y = make_classification(n_samples=1000, n_features=20, n_informative=15,
n_redundant=5, weights=[0.9, 0.1], random_state=42)
# Split data
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, stratify=y, random_state=42
)
# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Train model with class balancing
model = RandomForestClassifier(
n_estimators=100,
class_weight='balanced', # Handle imbalance
random_state=42
)
model.fit(X_train_scaled, y_train)
# Evaluate
predictions = model.predict(X_test_scaled)
probabilities = model.predict_proba(X_test_scaled)[:, 1]
print(classification_report(y_test, predictions))
print(f"ROC-AUC Score: {roc_auc_score(y_test, probabilities):.3f}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, predictions))
Use Case 2: Multi-Class Text Classification Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
# Sample text data
texts = [
"Python is great for machine learning",
"Java is used for enterprise applications",
"JavaScript powers web development",
# ... more texts
]
labels = ['tech', 'tech', 'tech'] # Categories
# Create text classification pipeline
text_clf = Pipeline([
('tfidf', TfidfVectorizer(
max_features=5000,
ngram_range=(1, 2),
stop_words='english'