# Pandas Cheatsheet
## Installation

| Platform | Command |
|---|---|
| pip (All platforms) | `pip install pandas` |
| conda | `conda install pandas` |
| Ubuntu/Debian | `sudo apt-get install python3-pandas` |
| macOS (Homebrew Python) | `pip3 install pandas` |
| With performance libs | `pip install "pandas[performance]"` |
| With all dependencies | `pip install "pandas[all]"` |
| Excel support | `pip install pandas openpyxl xlrd xlsxwriter` |
| SQL support | `pip install pandas sqlalchemy psycopg2-binary` |
| Verify installation | `python -c "import pandas as pd; print(pd.__version__)"` |
## Basic Commands - DataFrame Creation & I/O

| Command | Description |
|---|---|
| `pd.DataFrame({'col1': [1,2,3], 'col2': ['a','b','c']})` | Create DataFrame from dictionary |
| `pd.DataFrame([[1,'a'], [2,'b']], columns=['col1','col2'])` | Create DataFrame from list of lists |
| `pd.Series([1,2,3], name='my_series')` | Create a Series (single column) |
| `pd.read_csv('file.csv')` | Read CSV file into DataFrame |
| `pd.read_csv('file.csv', sep=';', encoding='utf-8')` | Read CSV with custom delimiter and encoding |
| `pd.read_excel('file.xlsx', sheet_name='Sheet1')` | Read Excel file |
| `pd.read_json('file.json')` | Read JSON file |
| `pd.read_sql('SELECT * FROM table', connection)` | Read from SQL database |
| `pd.read_parquet('file.parquet')` | Read Parquet file |
| `pd.read_clipboard()` | Read data from clipboard |
| `df.to_csv('output.csv', index=False)` | Write DataFrame to CSV |
| `df.to_excel('output.xlsx', sheet_name='Data')` | Write DataFrame to Excel |
| `df.to_json('output.json', orient='records')` | Write DataFrame to JSON |
| `df.to_sql('table_name', connection, if_exists='replace')` | Write DataFrame to SQL database |
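A quick sketch tying a few of these calls together; the file name `cities.csv` and the data are invented for illustration:

```python
import pandas as pd

# Build a small DataFrame in memory
df = pd.DataFrame({'city': ['Oslo', 'Lima', 'Pune'], 'pop': [0.7, 10.7, 3.1]})

# Round-trip through CSV; index=False avoids writing the integer index as a column
df.to_csv('cities.csv', index=False)
df2 = pd.read_csv('cities.csv')
print(df.equals(df2))  # True: the round trip preserved the data
```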
## Basic Commands - Data Inspection

| Command | Description |
|---|---|
| `df.head()` | Display first 5 rows |
| `df.head(10)` | Display first 10 rows |
| `df.tail()` | Display last 5 rows |
| `df.shape` | Get dimensions (rows, columns) |
| `df.columns` | Get column names |
| `df.dtypes` | Get data types of all columns |
| `df.index` | Get index information |
| `df.info()` | Get concise summary of DataFrame |
| `df.describe()` | Get statistical summary of numeric columns |
| `df.describe(include='all')` | Get statistical summary of all columns |
| `df.count()` | Count non-null values per column |
| `df.nunique()` | Count unique values per column |
| `df.memory_usage(deep=True)` | Get memory usage of DataFrame |
| `df.sample(5)` | Get random sample of 5 rows |
| `df.columns.tolist()` | Get column names as list |
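For instance, a few inspection calls on a toy frame (the values are chosen only to show the output types):

```python
import pandas as pd

df = pd.DataFrame({'a': [1, 2, 2, None], 'b': list('xyzz')})

print(df.shape)      # (4, 2)
print(df.dtypes)     # a: float64 (the None forces float), b: object
print(df.nunique())  # unique values per column: a=2, b=3
df.info()            # concise summary, including non-null counts
```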
## Basic Commands - Selection & Filtering

| Command | Description |
|---|---|
| `df['column_name']` | Select single column (returns Series) |
| `df[['col1', 'col2']]` | Select multiple columns (returns DataFrame) |
| `df.iloc[0]` | Select first row by position |
| `df.iloc[0:5]` | Select first 5 rows by position |
| `df.iloc[:, 0:2]` | Select all rows, first 2 columns |
| `df.loc[0]` | Select row by label/index |
| `df.loc[:, 'col1':'col3']` | Select columns by label range |
| `df[df['column'] > 5]` | Filter rows where column > 5 |
| `df[df['column'].isin(['val1', 'val2'])]` | Filter rows by list of values |
| `df[(df['col1'] > 5) & (df['col2'] < 10)]` | Filter with multiple conditions (AND) |
| `df[(df['col1'] > 5) \| (df['col2'] < 10)]` | Filter with multiple conditions (OR) |
| `df[~(df['col1'] > 5)]` | Filter with NOT condition |
| `df.query('col1 > 5 and col2 == "value"')` | Filter using SQL-like query string |
| `df.nlargest(5, 'column')` | Get 5 rows with largest values in column |
| `df.nsmallest(5, 'column')` | Get 5 rows with smallest values in column |
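A small illustration of boolean-mask filtering versus `query()`, using made-up values (note that each condition needs its own parentheses when combined with `&` or `|`):

```python
import pandas as pd

df = pd.DataFrame({'col1': [3, 6, 9], 'col2': [8, 12, 4]})

# Boolean masks combined with &
subset = df[(df['col1'] > 5) & (df['col2'] < 10)]
print(subset)  # only the row with col1=9, col2=4

# The same filter as a query string
print(df.query('col1 > 5 and col2 < 10'))
```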
## Basic Commands - Data Manipulation

| Command | Description |
|---|---|
| `df['new_col'] = df['col1'] + df['col2']` | Add new column with calculation |
| `df.drop('column', axis=1, inplace=True)` | Delete column |
| `df.drop([0, 1], axis=0, inplace=True)` | Delete rows by index |
| `df.rename(columns={'old': 'new'}, inplace=True)` | Rename columns |
| `df.sort_values('column', ascending=False)` | Sort by column values descending |
| `df.sort_values(['col1', 'col2'])` | Sort by multiple columns |
| `df.sort_index()` | Sort by index |
| `df.reset_index(drop=True, inplace=True)` | Reset index to default integer index |
| `df.set_index('column', inplace=True)` | Set column as index |
| `df['column'].unique()` | Get unique values in column |
| `df['column'].value_counts()` | Count occurrences of each value |
| `df.drop_duplicates()` | Remove duplicate rows |
| `df.drop_duplicates(subset=['col1'], keep='first')` | Remove duplicates based on specific columns |
| `df.fillna(0)` | Fill missing values with 0 |
| `df.dropna()` | Remove rows with any missing values |
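These commands compose naturally; a minimal sketch with invented data, assigning results instead of using `inplace=True`:

```python
import pandas as pd

df = pd.DataFrame({'a': [2, 1, 1], 'b': [10, 20, 20]})

# Derived column, de-duplication, and a descending sort in one pipeline
df['total'] = df['a'] + df['b']
df = df.drop_duplicates().sort_values('total', ascending=False).reset_index(drop=True)
print(df)
```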
## Basic Commands - Aggregation

| Command | Description |
|---|---|
| `df['column'].sum()` | Sum of column values |
| `df['column'].mean()` | Mean of column values |
| `df['column'].median()` | Median of column values |
| `df['column'].std()` | Standard deviation |
| `df['column'].var()` | Variance |
| `df['column'].min()` | Minimum value |
| `df['column'].max()` | Maximum value |
| `df['column'].quantile(0.75)` | Get 75th percentile |
| `df.groupby('column').sum()` | Group by column and sum |
| `df.groupby('column')['other_col'].mean()` | Group by and calculate mean of specific column |
| `df.groupby(['col1', 'col2']).size()` | Count rows per group |
| `df.groupby('col').agg({'col2': 'sum', 'col3': 'mean'})` | Multiple aggregations |
| `df.corr()` | Correlation matrix (pass `numeric_only=True` if the frame has text columns) |
| `df['col1'].corr(df['col2'])` | Correlation between two columns |
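A minimal groupby example with a hypothetical `team`/`score` frame:

```python
import pandas as pd

df = pd.DataFrame({'team': ['A', 'A', 'B'], 'score': [10, 20, 30]})

# One aggregate per group...
print(df.groupby('team')['score'].mean())  # A: 15.0, B: 30.0

# ...or several at once with a dict
print(df.groupby('team').agg({'score': ['sum', 'mean']}))
```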
## Advanced Usage - Complex Selection & Filtering

| Command | Description |
|---|---|
| `df.select_dtypes(include=['float64', 'int64'])` | Select columns by data type |
| `df.select_dtypes(exclude=['object'])` | Exclude columns by data type |
| `df[df['col'].str.contains('pattern', na=False)]` | Filter by string pattern |
| `df[df['col'].str.startswith('prefix')]` | Filter by string prefix |
| `df[df['col'].str.endswith('suffix')]` | Filter by string suffix |
| `df[df['col'].str.match(r'^pattern.*')]` | Filter by regex pattern |
| `df.loc[lambda x: x['col'] > x['col'].mean()]` | Filter using callable/lambda |
| `df.at[0, 'column']` | Fast scalar access by label |
| `df.iat[0, 0]` | Fast scalar access by position |
| `df.xs('key', level='level_name')` | Cross-section selection in a MultiIndex |
| `df.filter(like='pattern')` | Filter columns by name substring |
| `df.filter(regex=r'^col\d+')` | Filter columns by regex |
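For example, string filters and callable `.loc` selection on an invented frame (`na=False` keeps missing values from breaking the mask):

```python
import pandas as pd

df = pd.DataFrame({'name': ['alice', 'bob', 'carol'], 'age': [31, 18, 45]})

# String-pattern filter; missing names would count as non-matches
print(df[df['name'].str.contains('a', na=False)])

# Callable filter: rows above the column mean
print(df.loc[lambda x: x['age'] > x['age'].mean()])
```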
## Advanced Usage - Data Transformation

| Command | Description |
|---|---|
| `df['col'].apply(lambda x: x * 2)` | Apply function to each element |
| `df.apply(lambda x: x.max() - x.min())` | Apply function to each column |
| `df.applymap(lambda x: x * 2)` | Apply function element-wise (deprecated; use `DataFrame.map`) |
| `df.map(lambda x: x * 2)` | Apply function element-wise to DataFrame (pandas 2.1+) |
| `df['col'].map({'old': 'new', 'old2': 'new2'})` | Map values using dictionary |
| `df['col'].replace({'old': 'new'}, inplace=True)` | Replace values |
| `df.replace([np.inf, -np.inf], np.nan)` | Replace infinite values with NaN |
| `df.astype({'col1': 'int64', 'col2': 'float64'})` | Change data types |
| `pd.to_datetime(df['date_col'])` | Convert to datetime |
| `pd.to_numeric(df['col'], errors='coerce')` | Convert to numeric, invalid values → NaN |
| `df['col'].str.lower()` | Convert strings to lowercase |
| `df['col'].str.strip()` | Remove leading/trailing whitespace |
| `df['col'].str.split(',', expand=True)` | Split strings into columns |
| `pd.cut(df['col'], bins=5)` | Bin continuous values into intervals |
| `pd.qcut(df['col'], q=4)` | Bin values into quantile-based intervals |
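A short sketch chaining string cleanup, numeric coercion, and binning; the data is illustrative, and the unparseable `'x'` becomes NaN:

```python
import pandas as pd

df = pd.DataFrame({'grade': [' A ', 'b', 'C '], 'score': ['10', 'x', '30']})

# Clean strings, coerce bad numbers to NaN, then bin what survives
df['grade'] = df['grade'].str.strip().str.upper()
df['score'] = pd.to_numeric(df['score'], errors='coerce')
df['band'] = pd.cut(df['score'], bins=2)
print(df)
```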
## Advanced Usage - Merging & Joining

| Command | Description |
|---|---|
| `pd.merge(df1, df2, on='key', how='inner')` | Inner join on key column |
| `pd.merge(df1, df2, on='key', how='left')` | Left join |
| `pd.merge(df1, df2, on='key', how='right')` | Right join |
| `pd.merge(df1, df2, on='key', how='outer')` | Full outer join |
| `pd.merge(df1, df2, left_on='key1', right_on='key2')` | Join on different column names |
| `pd.merge(df1, df2, on=['key1', 'key2'])` | Join on multiple keys |
| `pd.merge(df1, df2, on='key', indicator=True)` | Add merge indicator column |
| `df1.join(df2, how='left', lsuffix='_l', rsuffix='_r')` | Index-based join with suffixes |
| `pd.concat([df1, df2], axis=0, ignore_index=True)` | Concatenate vertically |
| `pd.concat([df1, df2], axis=1)` | Concatenate horizontally |
| `pd.concat([df1, df2], keys=['df1', 'df2'])` | Concatenate with hierarchical index |
| `pd.merge_asof(df1, df2, on='timestamp')` | Merge on nearest key (time series; inputs must be sorted by key) |
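A tiny outer-join example showing the `_merge` indicator column; keys and values are made up:

```python
import pandas as pd

left = pd.DataFrame({'key': [1, 2], 'l': ['a', 'b']})
right = pd.DataFrame({'key': [2, 3], 'r': ['x', 'y']})

# indicator=True adds a _merge column marking each row's origin
# (left_only, right_only, or both)
print(pd.merge(left, right, on='key', how='outer', indicator=True))
```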
## Advanced Usage - Reshaping & Pivoting

| Command | Description |
|---|---|
| `df.pivot(index='row', columns='col', values='val')` | Reshape data (pivot) |
| `df.pivot_table(values='val', index='row', columns='col', aggfunc='mean')` | Create pivot table with aggregation |
| `pd.melt(df, id_vars=['id'], value_vars=['col1', 'col2'])` | Unpivot DataFrame (wide to long) |
| `df.stack()` | Pivot columns to rows (wide to long) |
| `df.unstack()` | Pivot rows to columns (long to wide) |
| `pd.crosstab(df['col1'], df['col2'])` | Create cross-tabulation |
| `pd.crosstab(df['col1'], df['col2'], margins=True)` | Cross-tabulation with totals |
| `df.transpose()` or `df.T` | Transpose DataFrame (swap rows/columns) |
| `df.explode('column')` | Transform list-like values into rows |
| `pd.get_dummies(df['category'])` | One-hot encode categorical variable |
| `pd.get_dummies(df, columns=['cat1', 'cat2'])` | One-hot encode multiple columns |
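A round trip between wide and long form, assuming a toy `id`/`math`/`art` frame:

```python
import pandas as pd

df = pd.DataFrame({'id': [1, 2], 'math': [90, 80], 'art': [70, 60]})

# Wide to long...
long_df = pd.melt(df, id_vars=['id'], var_name='subject', value_name='score')
print(long_df)

# ...and back to wide
print(long_df.pivot(index='id', columns='subject', values='score'))
```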
## Advanced Usage - GroupBy Operations

| Command | Description |
|---|---|
| `df.groupby('col').agg(['sum', 'mean', 'std'])` | Multiple aggregations |
| `df.groupby('col').agg({'col1': 'sum', 'col2': 'mean'})` | Different aggregations per column |
| `df.groupby('col').agg(total=('val', 'sum'), avg=('val', 'mean'))` | Named aggregations |
| `df.groupby('col').transform('mean')` | Transform with group-wise operation |
| `df.groupby('col').filter(lambda x: len(x) > 5)` | Filter groups by condition |
| `df.groupby('col').nth(0)` | Get first row of each group |
| `df.groupby('col').cumsum()` | Cumulative sum within groups |
| `df.groupby('col').rank()` | Rank within groups |
| `df.groupby('col').shift(1)` | Shift values within groups |
| `df.groupby('col')['val'].expanding().mean()` | Expanding window within groups |
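`transform` is the workhorse here because it preserves the original shape, so the result aligns row-for-row with the source frame; a minimal sketch with invented data:

```python
import pandas as pd

df = pd.DataFrame({'grp': ['a', 'a', 'b'], 'val': [1, 3, 10]})

# Each row gets its own group's mean, which makes per-row deviations easy
df['grp_mean'] = df.groupby('grp')['val'].transform('mean')
df['dev'] = df['val'] - df['grp_mean']
print(df)
```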
## Advanced Usage - Time Series

| Command | Description |
|---|---|
| `df.set_index('date_col', inplace=True)` | Set datetime column as index |
| `df.resample('D').mean()` | Resample to daily frequency with mean |
| `df.resample('M').sum()` | Resample to monthly frequency with sum (use `'ME'` in pandas ≥ 2.2) |
| `df.resample('W').agg({'col1': 'sum', 'col2': 'mean'})` | Resample with multiple aggregations |
| `df.rolling(window=7).mean()` | 7-period rolling average |
| `df.rolling(window=7).sum()` | 7-period rolling sum |
| `df.expanding().mean()` | Expanding window mean |
| `df.ewm(span=10).mean()` | Exponentially weighted moving average |
| `df.shift(1)` | Shift values down by 1 period |
| `df.diff()` | Calculate difference with previous row |
| `df.pct_change()` | Calculate percentage change |
| `pd.date_range('2023-01-01', periods=10, freq='D')` | Generate date range |
| `df['date'].dt.year` | Extract year from datetime |
| `df['date'].dt.month` | Extract month from datetime |
| `df['date'].dt.dayofweek` | Extract day of week (0 = Monday) |
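A self-contained sketch using a generated date index, so no external data is needed:

```python
import pandas as pd

idx = pd.date_range('2023-01-01', periods=10, freq='D')
df = pd.DataFrame({'value': range(10)}, index=idx)

# 3-day rolling mean (first two rows are NaN until the window fills)
print(df.rolling(window=3).mean().head())

# Weekly sums
print(df.resample('W').sum())
```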
## Configuration

### Display Options

```python
# Set maximum rows to display
pd.set_option('display.max_rows', 100)

# Set maximum columns to display
pd.set_option('display.max_columns', 50)

# Set column width
pd.set_option('display.max_colwidth', 100)

# Set float display precision
pd.set_option('display.precision', 2)

# Print wide frames on one line instead of wrapping
pd.set_option('display.expand_frame_repr', False)

# Show full DataFrame without truncation
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Reset all options to defaults
pd.reset_option('all')

# Get current option value
pd.get_option('display.max_rows')

# Use copy-on-write mode (pandas 2.0+)
pd.options.mode.copy_on_write = True

# Enable accelerated computation backends
pd.set_option('compute.use_bottleneck', True)
pd.set_option('compute.use_numexpr', True)

# Set chained assignment behaviour: 'warn', 'raise', or None
pd.set_option('mode.chained_assignment', 'warn')
```
### I/O Options

```python
# Set default Excel writer engine
pd.set_option('io.excel.xlsx.writer', 'xlsxwriter')

# Set default Parquet engine
pd.set_option('io.parquet.engine', 'pyarrow')
```
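For temporary changes, `pd.option_context` scopes settings to a `with` block and restores the previous values on exit; a short example:

```python
import pandas as pd

df = pd.DataFrame({'x': range(100)})

# Options set inside the context revert automatically on exit
with pd.option_context('display.max_rows', 6, 'display.precision', 2):
    print(df)

print(pd.get_option('display.max_rows'))  # back to the previous value
```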
## Common Use Cases

### Use Case 1: Data Cleaning Pipeline
```python
import numpy as np
import pandas as pd

# Read data
df = pd.read_csv('raw_data.csv')

# Remove duplicates
df = df.drop_duplicates()

# Handle missing values
df['column1'] = df['column1'].fillna(df['column1'].mean())
df = df.dropna(subset=['critical_column'])

# Fix data types
df['date'] = pd.to_datetime(df['date'])
df['amount'] = pd.to_numeric(df['amount'], errors='coerce')

# Remove outliers (values beyond 3 standard deviations)
df = df[np.abs(df['value'] - df['value'].mean()) <= (3 * df['value'].std())]

# Standardize text
df['name'] = df['name'].str.strip().str.lower()

# Save cleaned data
df.to_csv('cleaned_data.csv', index=False)
```
### Use Case 2: Exploratory Data Analysis

```python
import pandas as pd

# Load data
df = pd.read_csv('sales_data.csv')

# Basic statistics
print(df.describe())
df.info()

# Check for missing values
print(df.isnull().sum())

# Analyze categorical variables
print(df['category'].value_counts())
print(df['region'].value_counts(normalize=True))

# Correlation analysis (numeric_only avoids errors on text columns)
correlation_matrix = df.corr(numeric_only=True)
print(correlation_matrix)

# Group analysis
sales_by_region = df.groupby('region').agg({
    'sales': ['sum', 'mean', 'count'],
    'profit': 'sum'
})
print(sales_by_region)

# Time-based analysis
df['date'] = pd.to_datetime(df['date'])
monthly_sales = df.set_index('date').resample('M')['sales'].sum()
print(monthly_sales)
```
### Use Case 3: Data Integration from Multiple Sources

```python
import pandas as pd

# Read from different sources
customers = pd.read_csv('customers.csv')
orders = pd.read_excel('orders.xlsx')
products = pd.read_json('products.json')

# Merge datasets
df = pd.merge(orders, customers, on='customer_id', how='left')
df = pd.merge(df, products, on='product_id', how='left')

# Calculate derived metrics
df['total_price'] = df['quantity'] * df['unit_price']
df['profit'] = df['total_price'] - df['cost']

# Aggregate by customer
customer_summary = df.groupby('customer_id').agg({
    'order_id': 'count',
    'total_price': 'sum',
    'profit': 'sum'
}).rename(columns={'order_id': 'order_count'})

# Export results
customer_summary.to_excel('customer_summary.xlsx')
```
### Use Case 4: Time Series Analysis

```python
import pandas as pd

# Read time series data
df = pd.read_csv('stock_prices.csv', parse_dates=['date'])
df.set_index('date', inplace=True)

# Calculate returns
df['daily_return'] = df['close'].pct_change()

# Calculate moving averages
df['MA_7'] = df['close'].rolling(window=7).mean()
df['MA_30'] = df['close'].rolling(window=30).mean()

# Calculate exponential moving average
df['EMA_12'] = df['close'].ewm(span=12).mean()

# Resample to weekly data
weekly_df = df.resample('W').agg({
    'open': 'first',
    'high': 'max',
    'low': 'min',
    'close': 'last',
    'volume': 'sum'
})

# Calculate volatility (rolling standard deviation)
df['volatility'] = df['daily_return'].rolling(window=30).std()

# Export analysis
df.to_csv('stock_analysis.csv')
```
### Use Case 5: Machine Learning Data Preparation

```python
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load data
df = pd.read_csv('ml_dataset.csv')

# Handle missing values
df.fillna(df.median(numeric_only=True), inplace=True)

# Encode categorical variables
df = pd.get_dummies(df, columns=['category', 'region'], drop_first=True)

# Create features
df['feature_ratio'] = df['feature1'] / (df['feature2'] + 1)
df['feature_interaction'] = df['feature1'] * df['feature2']

# Separate features and target
X = df.drop('target', axis=1)
y = df['target']

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scale features (fit on the training set only to avoid leakage)
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_test.columns,
    index=X_test.index
)
```
## Best Practices

- **Use inplace=False by default:** Avoid `inplace=True`; assigning results to new variables maintains immutability and makes debugging easier.
- **Vectorize operations:** Use pandas' built-in vectorized operations instead of loops. `df['col'] * 2` is much faster than `df['col'].apply(lambda x: x * 2)`.
- **Chain operations efficiently:** Use method chaining with proper formatting for readable code: `df.query('col > 5').groupby('category').sum()`.
- **Specify dtypes when reading data:** Use the `dtype` parameter in `read_csv()` to reduce memory usage and improve performance: `pd.read_csv('file.csv', dtype={'col1': 'int32'})`.
- **Use the categorical dtype for low-cardinality columns:** Convert string columns with few unique values to categorical to save memory: `df['category'] = df['category'].astype('category')`.
- **Handle missing values explicitly:** Always check for and handle missing values consciously rather than letting them propagate: run `df.isnull().sum()` before processing.
- **Use copy() when needed:** When creating a subset that you'll modify, take an explicit copy so changes neither affect the original nor trigger `SettingWithCopyWarning`: `subset = df[df['col'] > 5].copy()`. See the sketch below.
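A compact sketch putting several of these practices together; the column names `col` and `category` are placeholders:

```python
import pandas as pd

df = pd.DataFrame({'col': range(5), 'category': ['a', 'b'] * 2 + ['a']})

# Vectorized arithmetic instead of apply
df['doubled'] = df['col'] * 2

# Categorical dtype for a low-cardinality column
df['category'] = df['category'].astype('category')

# Explicit copy of a subset you intend to modify
subset = df[df['col'] > 2].copy()
subset['col'] = 0  # does not touch df
```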