```python
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

# Logistic Regression parameters
LogisticRegression(
    penalty='l2',        # Regularization type: 'l1', 'l2', 'elasticnet', 'none'
    C=1.0,               # Inverse regularization strength (smaller = stronger)
    solver='lbfgs',      # Algorithm: 'lbfgs', 'liblinear', 'newton-cg', 'sag', 'saga'
    max_iter=100,        # Maximum iterations
    random_state=42,     # Random seed for reproducibility
    n_jobs=-1            # Use all CPU cores
)

# Random Forest parameters
RandomForestClassifier(
    n_estimators=100,        # Number of trees
    criterion='gini',        # Split quality: 'gini' or 'entropy'
    max_depth=None,          # Maximum tree depth (None = unlimited)
    min_samples_split=2,     # Minimum samples to split node
    min_samples_leaf=1,      # Minimum samples in leaf
    max_features='sqrt',     # Features per split: 'sqrt', 'log2', int, float
    bootstrap=True,          # Bootstrap sampling
    oob_score=False,         # Out-of-bag score estimation
    n_jobs=-1,               # Parallel jobs
    random_state=42,
    class_weight='balanced'  # Handle imbalanced classes
)

# Support Vector Machine parameters
SVC(
    C=1.0,              # Regularization parameter
    kernel='rbf',       # Kernel: 'linear', 'poly', 'rbf', 'sigmoid'
    degree=3,           # Polynomial degree (for 'poly' kernel)
    gamma='scale',      # Kernel coefficient: 'scale', 'auto', or float
    coef0=0.0,          # Independent term in kernel
    probability=False,  # Enable probability estimates (slower)
    cache_size=200,     # Kernel cache size (MB)
    class_weight=None,  # Class weights
    max_iter=-1         # Iteration limit (-1 = no limit)
)

# Gradient Boosting parameters
GradientBoostingClassifier(
    loss='log_loss',           # Loss function
    learning_rate=0.1,         # Shrinks contribution of each tree
    n_estimators=100,          # Number of boosting stages
    subsample=1.0,             # Fraction of samples for fitting
    criterion='friedman_mse',  # Split quality measure
    min_samples_split=2,
    min_samples_leaf=1,
    max_depth=3,               # Maximum tree depth
    max_features=None,         # Features per split
    validation_fraction=0.1,   # Fraction for early stopping
    n_iter_no_change=None,     # Early stopping rounds
    random_state=42
)
```
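These constructor calls show commonly tuned parameters with typical values. Every parameter can also be inspected or changed after construction via the standard `get_params`/`set_params` estimator API; a minimal sketch (the `C=0.5` override is just an illustration):

```python
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()           # All defaults
print(model.get_params()['C'])         # Inspect a single parameter -> 1.0
model.set_params(C=0.5, max_iter=500)  # Override parameters after construction
```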
```python
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier

# Complete pipeline with preprocessing
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=0.95)),
    ('classifier', RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        random_state=42
    ))
])

# Pipeline with column-specific preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['age', 'income', 'score']),
        ('cat', OneHotEncoder(drop='first'), ['category', 'region'])
    ],
    remainder='passthrough'  # Keep other columns unchanged
)

full_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier())
])
```
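Because `ColumnTransformer` selects columns by name, `full_pipeline` must be fit on a pandas DataFrame containing those columns. A minimal sketch with made-up rows (the values below are purely illustrative):

```python
import pandas as pd

# Hypothetical training data matching the column names above
X = pd.DataFrame({
    'age': [25, 40, 33, 51],
    'income': [40000, 85000, 62000, 91000],
    'score': [0.2, 0.8, 0.5, 0.9],
    'category': ['a', 'b', 'a', 'b'],
    'region': ['north', 'south', 'north', 'east'],
})
y = [0, 1, 0, 1]

full_pipeline.fit(X, y)
# Individual steps remain accessible by name
print(full_pipeline.named_steps['preprocessor'])
```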
```python
from sklearn.model_selection import cross_validate

# Multi-metric cross-validation
cv_results = cross_validate(
    estimator=model,
    X=X,
    y=y,
    cv=5,                     # Number of folds
    scoring={
        'accuracy': 'accuracy',
        'precision': 'precision_weighted',
        'recall': 'recall_weighted',
        'f1': 'f1_weighted',
        'roc_auc': 'roc_auc_ovr'
    },
    return_train_score=True,  # Include training scores
    return_estimator=True,    # Return fitted estimators
    n_jobs=-1,                # Parallel processing
    verbose=1                 # Progress messages
)
```
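`cross_validate` returns a dict of arrays with one entry per fold; a common follow-up is to report the mean and spread of each metric. A short sketch, assuming `model`, `X`, and `y` are defined as above:

```python
# Summarize each test metric across the folds
for metric in ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']:
    scores = cv_results[f'test_{metric}']
    print(f"{metric}: {scores.mean():.3f} +/- {scores.std():.3f}")
```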
```python
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Comprehensive parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True, False]
}

grid_search = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=param_grid,
    scoring='f1_weighted',    # Scoring metric
    cv=5,                     # Cross-validation folds
    n_jobs=-1,                # Use all cores
    verbose=2,                # Verbosity level
    refit=True,               # Refit best model on full data
    return_train_score=True,  # Return training scores
    error_score='raise'       # How to handle errors
)
```
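After fitting, the winning configuration and the refit model are exposed as attributes. A sketch, assuming training data `X` and `y` are already defined:

```python
grid_search.fit(X, y)

print(grid_search.best_params_)            # Winning parameter combination
print(grid_search.best_score_)             # Mean CV score of that combination
best_model = grid_search.best_estimator_   # Refit on the full data (refit=True)
```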
## Use Case 1: Binary Classification with Imbalanced Data
```python
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import numpy as np

# Generate imbalanced dataset
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    weights=[0.9, 0.1],
    random_state=42
)

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train model with class balancing
model = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',  # Handle imbalance
    random_state=42
)
model.fit(X_train_scaled, y_train)

# Evaluate
predictions = model.predict(X_test_scaled)
probabilities = model.predict_proba(X_test_scaled)[:, 1]

print(classification_report(y_test, predictions))
print(f"ROC-AUC Score: {roc_auc_score(y_test, probabilities):.3f}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, predictions))
```
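With a 90/10 split like the one generated above, ROC-AUC alone can look optimistic; average precision (area under the precision-recall curve) weights minority-class performance more heavily. A short sketch reusing `y_test` and `probabilities` from the example:

```python
from sklearn.metrics import average_precision_score

# Precision-recall AUC is more sensitive to minority-class errors than ROC-AUC
print(f"Average Precision: {average_precision_score(y_test, probabilities):.3f}")
```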
## Use Case 2: Multi-Class Text Classification Pipeline
```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

texts = [
    "Python is great for machine learning",
    "Java is used for enterprise applications",
    "JavaScript powers web development",
    # ... more texts
]
labels = ['ml', 'enterprise', 'web']  # One category per text
```
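The source example is truncated at this point. One plausible continuation, matching the imports above, is to wire the vectorizer and classifier into a pipeline; treat the step names, parameters, and category labels as assumptions rather than the original code:

```python
# Sketch only: TF-IDF features feeding a logistic regression classifier
text_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('classifier', LogisticRegression(max_iter=1000))
])

text_pipeline.fit(texts, labels)
print(text_pipeline.predict(["TypeScript adds static types to JavaScript"]))

# With a larger corpus (at least 5 examples per class), evaluate with cross-validation:
# scores = cross_val_score(text_pipeline, texts, labels, cv=5, scoring='f1_weighted')
```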