Scikit-learn Cheatsheet

Installation

| Platform | Command |
|---|---|
| pip (all platforms) | `pip install scikit-learn` |
| conda | `conda install scikit-learn` |
| Ubuntu/Debian | `sudo apt-get install python3-sklearn` |
| macOS (Homebrew) | `brew install python && pip3 install scikit-learn` |
| Specific version | `pip install scikit-learn==1.3.0` |
| With dependencies | `pip install scikit-learn numpy scipy matplotlib pandas` |
| Virtual environment | `python -m venv env && source env/bin/activate && pip install scikit-learn` |
| Verify installation | `python -c "import sklearn; print(sklearn.__version__)"` |

Basic Commands - Data Loading & Splitting

| Command | Description |
|---|---|
| `from sklearn import datasets` | Import datasets module |
| `iris = datasets.load_iris()` | Load iris dataset |
| `X, y = iris.data, iris.target` | Extract features and labels |
| `datasets.load_digits()` | Load handwritten digits dataset |
| `datasets.load_wine()` | Load wine classification dataset |
| `datasets.load_breast_cancer()` | Load breast cancer dataset |
| `from sklearn.model_selection import train_test_split` | Import train/test split function |
| `X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)` | Split data 80/20 train/test |
| `train_test_split(X, y, test_size=0.3, random_state=42)` | Split with fixed random seed |
| `train_test_split(X, y, stratify=y)` | Stratified split (preserves class distribution) |
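
A minimal sketch tying these commands together; the iris dataset, the stratified 80/20 split, and the fixed seed are illustrative choices:

```python
from sklearn import datasets
from sklearn.model_selection import train_test_split

# Load the iris dataset and separate features from labels
iris = datasets.load_iris()
X, y = iris.data, iris.target

# Stratified 80/20 split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
print(X_train.shape, X_test.shape)  # (120, 4) (30, 4)
```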

Basic Commands - Preprocessing

| Command | Description |
|---|---|
| `from sklearn.preprocessing import StandardScaler` | Import standardization scaler |
| `scaler = StandardScaler()` | Create scaler instance |
| `scaler.fit(X_train)` | Compute mean and std from training data |
| `X_scaled = scaler.transform(X_train)` | Apply scaling transformation |
| `X_scaled = scaler.fit_transform(X_train)` | Fit and transform in one step |
| `from sklearn.preprocessing import MinMaxScaler` | Import min-max normalizer |
| `normalizer = MinMaxScaler(feature_range=(0, 1))` | Create normalizer for 0-1 range |
| `from sklearn.preprocessing import LabelEncoder` | Import label encoder |
| `encoder = LabelEncoder()` | Create encoder instance |
| `y_encoded = encoder.fit_transform(y)` | Encode categorical labels to integers |
| `encoder.inverse_transform(y_encoded)` | Decode integers back to original labels |
| `from sklearn.preprocessing import OneHotEncoder` | Import one-hot encoder |
| `OneHotEncoder(sparse_output=False).fit_transform(X)` | Create dummy variables from categories (`sparse_output` replaces the older `sparse` parameter in scikit-learn 1.2+) |
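
A small sketch of scaling and label encoding together; the toy arrays are purely illustrative:

```python
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder

X_train = np.array([[1.0, 200.0], [2.0, 300.0], [3.0, 400.0]])  # toy numeric features
y = np.array(["cat", "dog", "cat"])                             # toy string labels

# Standardize each column to zero mean and unit variance
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_train)

# Encode string labels as integers and map them back
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)         # array([0, 1, 0])
print(encoder.inverse_transform(y_encoded))  # ['cat' 'dog' 'cat']
```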

Basic Commands - Classification

| Command | Description |
|---|---|
| `from sklearn.linear_model import LogisticRegression` | Import logistic regression |
| `model = LogisticRegression()` | Create logistic regression model |
| `model.fit(X_train, y_train)` | Train model on training data |
| `predictions = model.predict(X_test)` | Make predictions on test data |
| `probabilities = model.predict_proba(X_test)` | Get prediction probabilities |
| `from sklearn.tree import DecisionTreeClassifier` | Import decision tree classifier |
| `tree = DecisionTreeClassifier(max_depth=5)` | Create decision tree with max depth |
| `from sklearn.neighbors import KNeighborsClassifier` | Import K-nearest neighbors |
| `knn = KNeighborsClassifier(n_neighbors=5)` | Create KNN with 5 neighbors |
| `from sklearn.naive_bayes import GaussianNB` | Import Gaussian Naive Bayes |
| `nb = GaussianNB()` | Create Naive Bayes classifier |
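
A compact classification sketch; the iris dataset and the raised `max_iter` are illustrative assumptions:

```python
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# max_iter raised so the lbfgs solver converges cleanly on this data
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

predictions = model.predict(X_test)           # class labels
probabilities = model.predict_proba(X_test)   # per-class probabilities
print(predictions[:5])
```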

Basic Commands - Regression

| Command | Description |
|---|---|
| `from sklearn.linear_model import LinearRegression` | Import linear regression |
| `model = LinearRegression()` | Create linear regression model |
| `model.fit(X_train, y_train)` | Train regression model |
| `predictions = model.predict(X_test)` | Predict continuous values |
| `model.coef_` | Access model coefficients |
| `model.intercept_` | Access model intercept |
| `from sklearn.linear_model import Ridge` | Import Ridge regression (L2) |
| `ridge = Ridge(alpha=1.0)` | Create Ridge with regularization strength |
| `from sklearn.linear_model import Lasso` | Import Lasso regression (L1) |
| `lasso = Lasso(alpha=0.1)` | Create Lasso with regularization |
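
A short regression sketch, assuming the bundled diabetes dataset as example data:

```python
from sklearn.datasets import load_diabetes
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split

X, y = load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

ridge = Ridge(alpha=1.0)   # L2-regularized linear regression
ridge.fit(X_train, y_train)

print(ridge.coef_[:3], ridge.intercept_)  # fitted parameters
print(ridge.score(X_test, y_test))        # R² on the test set
```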

Basic Commands - Model Evaluation

| Command | Description |
|---|---|
| `from sklearn.metrics import accuracy_score` | Import accuracy metric |
| `accuracy_score(y_test, predictions)` | Calculate accuracy (fraction of correct predictions, 0-1) |
| `from sklearn.metrics import classification_report` | Import detailed classification metrics |
| `classification_report(y_test, predictions)` | Get precision, recall, F1-score |
| `from sklearn.metrics import confusion_matrix` | Import confusion matrix |
| `confusion_matrix(y_test, predictions)` | Create confusion matrix |
| `from sklearn.metrics import mean_squared_error` | Import MSE metric |
| `mean_squared_error(y_test, predictions)` | Calculate mean squared error |
| `mean_squared_error(y_test, predictions, squared=False)` | Calculate RMSE |
| `from sklearn.metrics import r2_score` | Import R² metric |
| `r2_score(y_test, predictions)` | Calculate R² score |
| `model.score(X_test, y_test)` | Get default score (accuracy or R²) |
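
A tiny evaluation sketch with hand-written labels (illustrative only):

```python
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

y_test = [0, 1, 1, 0, 1]        # toy true labels
predictions = [0, 1, 0, 0, 1]   # toy predictions

print(accuracy_score(y_test, predictions))   # 0.8 (4 of 5 correct)
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))
```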

Basic Commands - Cross-Validation

| Command | Description |
|---|---|
| `from sklearn.model_selection import cross_val_score` | Import cross-validation |
| `cross_val_score(model, X, y, cv=5)` | Perform 5-fold cross-validation |
| `scores = cross_val_score(model, X, y, cv=10)` | 10-fold cross-validation |
| `scores.mean()` | Get mean cross-validation score |
| `scores.std()` | Get standard deviation of scores |
| `from sklearn.model_selection import cross_validate` | Import multi-metric CV |
| `cross_validate(model, X, y, cv=5, scoring=['accuracy', 'precision'])` | CV with multiple metrics |
| `from sklearn.model_selection import KFold` | Import K-fold splitter |
| `kfold = KFold(n_splits=5, shuffle=True, random_state=42)` | Create K-fold object |
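
A brief cross-validation sketch; the iris dataset and logistic regression model are illustrative:

```python
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score

X, y = load_iris(return_X_y=True)
model = LogisticRegression(max_iter=1000)

# Shuffled 5-fold CV with a fixed seed
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X, y, cv=kfold)
print(f"{scores.mean():.3f} +/- {scores.std():.3f}")
```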

Advanced Usage - Ensemble Methods

| Command | Description |
|---|---|
| `from sklearn.ensemble import RandomForestClassifier` | Import random forest |
| `rf = RandomForestClassifier(n_estimators=100, max_depth=10, n_jobs=-1)` | Create random forest with 100 trees |
| `rf.feature_importances_` | Get feature importance scores |
| `from sklearn.ensemble import GradientBoostingClassifier` | Import gradient boosting |
| `gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1)` | Create gradient boosting model |
| `from sklearn.ensemble import AdaBoostClassifier` | Import AdaBoost |
| `ada = AdaBoostClassifier(n_estimators=50, learning_rate=1.0)` | Create AdaBoost classifier |
| `from sklearn.ensemble import VotingClassifier` | Import voting ensemble |
| `VotingClassifier(estimators=[('lr', model1), ('rf', model2)], voting='soft')` | Combine multiple models |
| `from sklearn.ensemble import BaggingClassifier` | Import bagging ensemble |
| `BaggingClassifier(estimator=tree, n_estimators=10)` | Create bagging ensemble (`estimator` replaces the older `base_estimator` parameter in scikit-learn 1.2+) |
| `from sklearn.ensemble import StackingClassifier` | Import stacking ensemble |
| `StackingClassifier(estimators=[...], final_estimator=LogisticRegression())` | Stack models with meta-learner |
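
A sketch of a soft-voting ensemble; the breast cancer dataset and hyperparameters are illustrative, and the logistic regression is wrapped in a scaling pipeline so it converges:

```python
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

rf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)
lr = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))

# Soft voting averages predicted probabilities across the two models
voting = VotingClassifier(estimators=[('lr', lr), ('rf', rf)], voting='soft')
voting.fit(X_train, y_train)
print(voting.score(X_test, y_test))

rf.fit(X_train, y_train)
print(rf.feature_importances_[:5])  # importance of the first five features
```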

Advanced Usage - Support Vector Machines

| Command | Description |
|---|---|
| `from sklearn.svm import SVC` | Import SVM classifier |
| `svm = SVC(kernel='rbf', C=1.0, gamma='scale')` | Create RBF kernel SVM |
| `SVC(kernel='linear')` | Linear kernel SVM |
| `SVC(kernel='poly', degree=3)` | Polynomial kernel SVM |
| `SVC(probability=True)` | Enable probability estimates |
| `from sklearn.svm import SVR` | Import SVM regressor |
| `svr = SVR(kernel='rbf', C=1.0, epsilon=0.1)` | Create SVM regressor |
| `from sklearn.svm import LinearSVC` | Import linear SVM (faster) |
| `LinearSVC(C=1.0, max_iter=1000)` | Linear SVM for large datasets |
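
A short SVM sketch; because SVMs are sensitive to feature scale, the classifier is paired with a scaler (dataset choice is illustrative):

```python
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Scale features, then fit an RBF-kernel SVM
svm = make_pipeline(StandardScaler(), SVC(kernel='rbf', C=1.0, gamma='scale'))
svm.fit(X_train, y_train)
print(svm.score(X_test, y_test))
```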

Advanced Usage - Clustering

| Command | Description |
|---|---|
| `from sklearn.cluster import KMeans` | Import K-means clustering |
| `kmeans = KMeans(n_clusters=3, random_state=42)` | Create K-means with 3 clusters |
| `clusters = kmeans.fit_predict(X)` | Fit and get cluster labels |
| `kmeans.cluster_centers_` | Get cluster centroids |
| `kmeans.inertia_` | Get within-cluster sum of squares |
| `from sklearn.cluster import DBSCAN` | Import DBSCAN clustering |
| `DBSCAN(eps=0.5, min_samples=5).fit_predict(X)` | Density-based clustering |
| `from sklearn.cluster import AgglomerativeClustering` | Import hierarchical clustering |
| `AgglomerativeClustering(n_clusters=3, linkage='ward')` | Hierarchical clustering |
| `from sklearn.cluster import MeanShift` | Import mean shift clustering |
| `MeanShift(bandwidth=2.0).fit_predict(X)` | Mean shift clustering |
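
A minimal K-means sketch on the iris features (the cluster count and `n_init` value are illustrative; `n_init` is set explicitly because its default has changed across versions):

```python
from sklearn.cluster import KMeans
from sklearn.datasets import load_iris

X, _ = load_iris(return_X_y=True)

kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
clusters = kmeans.fit_predict(X)      # cluster label per sample

print(kmeans.cluster_centers_.shape)  # (3, 4): one centroid per cluster
print(kmeans.inertia_)                # within-cluster sum of squares
```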

Advanced Usage - Dimensionality Reduction

| Command | Description |
|---|---|
| `from sklearn.decomposition import PCA` | Import PCA |
| `pca = PCA(n_components=2)` | Create PCA with 2 components |
| `X_pca = pca.fit_transform(X)` | Transform to principal components |
| `pca.explained_variance_ratio_` | Get variance explained by each component |
| `pca.components_` | Get principal component vectors |
| `PCA(n_components=0.95)` | Keep components explaining 95% of variance |
| `from sklearn.manifold import TSNE` | Import t-SNE |
| `TSNE(n_components=2, perplexity=30).fit_transform(X)` | t-SNE dimensionality reduction |
| `from sklearn.decomposition import TruncatedSVD` | Import truncated SVD |
| `TruncatedSVD(n_components=100).fit_transform(X)` | SVD for sparse matrices |
| `from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA` | Import LDA (aliased so the next command works) |
| `LDA(n_components=2).fit_transform(X, y)` | Supervised dimensionality reduction |
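
A quick PCA sketch on the iris features (dataset choice is illustrative):

```python
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA

X, _ = load_iris(return_X_y=True)

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

print(X_pca.shape)                    # (150, 2)
print(pca.explained_variance_ratio_)  # variance captured by each component
```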

Advanced Usage - Pipeline Construction

| Command | Description |
|---|---|
| `from sklearn.pipeline import Pipeline` | Import pipeline |
| `Pipeline([('scaler', StandardScaler()), ('clf', LogisticRegression())])` | Create processing pipeline |
| `pipeline.fit(X_train, y_train)` | Fit entire pipeline |
| `pipeline.predict(X_test)` | Predict using pipeline |
| `from sklearn.pipeline import make_pipeline` | Import pipeline maker |
| `make_pipeline(StandardScaler(), PCA(10), LogisticRegression())` | Auto-name pipeline steps |
| `pipeline.named_steps['scaler']` | Access specific pipeline step |
| `from sklearn.compose import ColumnTransformer` | Import column transformer |
| `ColumnTransformer([('num', StandardScaler(), [0, 1]), ('cat', OneHotEncoder(), [2])])` | Different preprocessing per column |
| `pipeline.get_params()` | Get all pipeline parameters |
| `pipeline.set_params(clf__C=0.1)` | Set nested pipeline parameters |
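
A compact sketch of `make_pipeline` and `named_steps` (the dataset, component count, and `max_iter` are illustrative; the Pipeline Configuration section below shows the explicit `Pipeline([...])` form):

```python
from sklearn.datasets import load_breast_cancer
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# make_pipeline auto-names steps after their classes: 'standardscaler', 'pca', ...
pipe = make_pipeline(StandardScaler(), PCA(n_components=10), LogisticRegression(max_iter=1000))
pipe.fit(X_train, y_train)

print(pipe.named_steps['pca'].explained_variance_ratio_.sum())
print(pipe.score(X_test, y_test))
```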

Advanced Usage - Hyperparameter Tuning

| Command | Description |
|---|---|
| `from sklearn.model_selection import GridSearchCV` | Import grid search |
| `GridSearchCV(model, param_grid, cv=5, n_jobs=-1)` | Exhaustive parameter search |
| `grid_search.fit(X_train, y_train)` | Run grid search |
| `grid_search.best_params_` | Get best parameters found |
| `grid_search.best_score_` | Get best cross-validation score |
| `grid_search.best_estimator_` | Get best model |
| `grid_search.cv_results_` | Get detailed CV results |
| `from sklearn.model_selection import RandomizedSearchCV` | Import randomized search |
| `RandomizedSearchCV(model, param_distributions, n_iter=100, cv=5)` | Random parameter sampling |
| `from scipy.stats import randint, uniform` | Import distributions for random search |
| `param_distributions = {'n_estimators': randint(50, 200)}` | Define parameter distribution |
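
A randomized-search sketch (the estimator, dataset, and search space are illustrative; the Grid Search Configuration section below shows the exhaustive variant):

```python
from scipy.stats import randint
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

X, y = load_breast_cancer(return_X_y=True)

param_distributions = {
    'n_estimators': randint(50, 200),   # sampled uniformly from [50, 200)
    'max_depth': [5, 10, None],
}
search = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_distributions,
    n_iter=20, cv=5, n_jobs=-1, random_state=42,
)
search.fit(X, y)
print(search.best_params_, search.best_score_)
```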

Advanced Usage - Feature Selection

| Command | Description |
|---|---|
| `from sklearn.feature_selection import SelectKBest` | Import K-best selector |
| `SelectKBest(k=10).fit_transform(X, y)` | Select top 10 features |
| `from sklearn.feature_selection import chi2, f_classif` | Import scoring functions |
| `SelectKBest(score_func=chi2, k=5)` | Chi-squared feature selection |
| `from sklearn.feature_selection import RFE` | Import recursive feature elimination |
| `RFE(estimator=model, n_features_to_select=10).fit(X, y)` | Recursive feature elimination |
| `from sklearn.feature_selection import SelectFromModel` | Import model-based selection |
| `SelectFromModel(RandomForestClassifier()).fit(X, y)` | Select features by importance |
| `selector.get_support()` | Get boolean mask of selected features |
| `selector.transform(X)` | Transform to selected features only |
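
A small univariate feature-selection sketch (dataset and `k` are illustrative):

```python
from sklearn.datasets import load_breast_cancer
from sklearn.feature_selection import SelectKBest, f_classif

X, y = load_breast_cancer(return_X_y=True)

selector = SelectKBest(score_func=f_classif, k=10)
X_selected = selector.fit_transform(X, y)

print(X_selected.shape)        # (569, 10)
print(selector.get_support())  # boolean mask over the original 30 features
```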

Advanced Usage - Model Persistence

| Command | Description |
|---|---|
| `import joblib` | Import joblib for model saving |
| `joblib.dump(model, 'model.pkl')` | Save model to file |
| `model = joblib.load('model.pkl')` | Load model from file |
| `import pickle` | Import pickle module |
| `pickle.dump(model, open('model.pkl', 'wb'))` | Save with pickle |
| `model = pickle.load(open('model.pkl', 'rb'))` | Load with pickle |
| `joblib.dump(model, 'model.pkl', compress=3)` | Save with compression |
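
A round-trip persistence sketch (writes `model.pkl` to the working directory; dataset and model are illustrative):

```python
import joblib
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

X, y = load_iris(return_X_y=True)
model = LogisticRegression(max_iter=1000).fit(X, y)

joblib.dump(model, 'model.pkl', compress=3)   # save with compression
restored = joblib.load('model.pkl')           # load it back
print(restored.predict(X[:3]))
```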

Configuration

Model Parameters Configuration

# Logistic Regression parameters
LogisticRegression(
    penalty='l2',           # Regularization type: 'l1', 'l2', 'elasticnet', or None
    C=1.0,                  # Inverse regularization strength (smaller = stronger)
    solver='lbfgs',         # Algorithm: 'lbfgs', 'liblinear', 'newton-cg', 'sag', 'saga'
    max_iter=100,           # Maximum iterations
    random_state=42,        # Random seed for reproducibility
    n_jobs=-1               # Use all CPU cores
)

# Random Forest parameters
RandomForestClassifier(
    n_estimators=100,       # Number of trees
    criterion='gini',       # Split quality: 'gini' or 'entropy'
    max_depth=None,         # Maximum tree depth (None = unlimited)
    min_samples_split=2,    # Minimum samples to split node
    min_samples_leaf=1,     # Minimum samples in leaf
    max_features='sqrt',    # Features per split: 'sqrt', 'log2', int, float
    bootstrap=True,         # Bootstrap sampling
    oob_score=False,        # Out-of-bag score estimation
    n_jobs=-1,              # Parallel jobs
    random_state=42,
    class_weight='balanced' # Handle imbalanced classes
)

# Support Vector Machine parameters
SVC(
    C=1.0,                  # Regularization parameter
    kernel='rbf',           # Kernel: 'linear', 'poly', 'rbf', 'sigmoid'
    degree=3,               # Polynomial degree (for 'poly' kernel)
    gamma='scale',          # Kernel coefficient: 'scale', 'auto', or float
    coef0=0.0,              # Independent term in kernel
    probability=False,      # Enable probability estimates (slower)
    cache_size=200,         # Kernel cache size (MB)
    class_weight=None,      # Class weights
    max_iter=-1             # Iteration limit (-1 = no limit)
)

# Gradient Boosting parameters
GradientBoostingClassifier(
    loss='log_loss',        # Loss function
    learning_rate=0.1,      # Shrinks contribution of each tree
    n_estimators=100,       # Number of boosting stages
    subsample=1.0,          # Fraction of samples for fitting
    criterion='friedman_mse', # Split quality measure
    min_samples_split=2,
    min_samples_leaf=1,
    max_depth=3,            # Maximum tree depth
    max_features=None,      # Features per split
    validation_fraction=0.1, # Fraction for early stopping
    n_iter_no_change=None,  # Early stopping rounds
    random_state=42
)

Pipeline Configuration

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier

# Complete pipeline with preprocessing
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=0.95)),
    ('classifier', RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        random_state=42
    ))
])

# Pipeline with column-specific preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['age', 'income', 'score']),
        ('cat', OneHotEncoder(drop='first'), ['category', 'region'])
    ],
    remainder='passthrough'  # Keep other columns unchanged
)

full_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier())
])

Cross-Validation Configuration

from sklearn.model_selection import cross_validate

# Multi-metric cross-validation
cv_results = cross_validate(
    estimator=model,
    X=X,
    y=y,
    cv=5,                           # Number of folds
    scoring={
        'accuracy': 'accuracy',
        'precision': 'precision_weighted',
        'recall': 'recall_weighted',
        'f1': 'f1_weighted',
        'roc_auc': 'roc_auc_ovr'
    },
    return_train_score=True,        # Include training scores
    return_estimator=True,          # Return fitted estimators
    n_jobs=-1,                      # Parallel processing
    verbose=1                       # Progress messages
)

Grid Search Configuration

from sklearn.model_selection import GridSearchCV

# Comprehensive parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True, False]
}

grid_search = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=param_grid,
    scoring='f1_weighted',          # Scoring metric
    cv=5,                           # Cross-validation folds
    n_jobs=-1,                      # Use all cores
    verbose=2,                      # Verbosity level
    refit=True,                     # Refit best model on full data
    return_train_score=True,        # Return training scores
    error_score='raise'             # How to handle errors
)

Common Use Cases

Use Case 1: Binary Classification with Imbalanced Data

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import numpy as np

# Generate imbalanced dataset
X, y = make_classification(n_samples=1000, n_features=20, n_informative=15,
                          n_redundant=5, weights=[0.9, 0.1], random_state=42)

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train model with class balancing
model = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',  # Handle imbalance
    random_state=42
)
model.fit(X_train_scaled, y_train)

# Evaluate
predictions = model.predict(X_test_scaled)
probabilities = model.predict_proba(X_test_scaled)[:, 1]

print(classification_report(y_test, predictions))
print(f"ROC-AUC Score: {roc_auc_score(y_test, probabilities):.3f}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, predictions))

Use Case 2: Multi-Class Text Classification Pipeline

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Sample text data
texts = [
    "Python is great for machine learning",
    "Java is used for enterprise applications",
    "JavaScript powers web development",
    # ... more texts
]
labels = ['tech', 'tech', 'tech']  # Categories

# Create text classification pipeline
text_clf = Pipeline([
    ('tfidf', TfidfVectorizer(
        max_features=5000,
        ngram_range=(1, 2),
        stop_words='english'
    )),
    # The original snippet is cut off at this point; a LogisticRegression step
    # (imported above) is assumed here to close the pipeline.
    ('clf', LogisticRegression(max_iter=1000))
])
```