Pandas is the backbone of data processing in Python. Whether you’re cleaning messy CSVs, aggregating millions of rows, or preparing datasets for machine learning, Pandas has the tools you need. But knowing which tools to use — and how to use them efficiently — makes all the difference.
Here are practical tips and tricks that will make your Pandas code faster, cleaner, and more maintainable.
import pandas as pd
# Bad: Pandas infers types, often using 64-bit by default
df = pd.read_csv('large_file.csv')
# Good: Specify optimal dtypes
dtypes = {
'id': 'int32',
'category': 'category',
'price': 'float32',
'is_active': 'bool',
'name': 'string',
'date': 'str', # Parse dates separately
}
df = pd.read_csv('large_file.csv', dtype=dtypes, parse_dates=['date'])
# Only load the columns you need
df = pd.read_csv(
'large_file.csv',
usecols=['id', 'name', 'price', 'date'],
dtype={'id': 'int32', 'price': 'float32'}
)
# Process a huge file in chunks
chunk_size = 100_000
results = []
for chunk in pd.read_csv('huge_file.csv', chunksize=chunk_size):
# Process each chunk
filtered = chunk[chunk['price'] > 100]
results.append(filtered.groupby('category')['price'].mean())
# Combine results
final = pd.concat(results).groupby(level=0).mean()
The single biggest performance improvement you can make is replacing loops with vectorized operations:
import numpy as np
# Slow: Row-by-row iteration
def categorize_price_slow(df):
categories = []
for _, row in df.iterrows():
if row['price'] < 10:
categories.append('budget')
elif row['price'] < 50:
categories.append('mid-range')
else:
categories.append('premium')
df['category'] = categories
return df
# Fast: Vectorized with np.where
def categorize_price_fast(df):
df['category'] = np.where(
df['price'] < 10, 'budget',
np.where(df['price'] < 50, 'mid-range', 'premium')
)
return df
# Even better: pd.cut for binning
df['category'] = pd.cut(
df['price'],
bins=[0, 10, 50, float('inf')],
labels=['budget', 'mid-range', 'premium']
)
# Vectorized string methods
df['name_clean'] = (
df['name']
.str.strip()
.str.lower()
.str.replace(r'[^\w\s]', '', regex=True)
.str[:50] # Truncate to 50 characters
)
# Check for patterns
has_email = df['text'].str.contains(r'\S+@\S+\.\S+', regex=True)
result = (
pd.read_csv('sales.csv')
.assign(
date=lambda df: pd.to_datetime(df['date']),
revenue=lambda df: df['quantity'] * df['price']
)
.query('revenue > 0 & quantity > 0')
.assign(
category=lambda df: pd.cut(
df['revenue'],
bins=[0, 100, 1000, float('inf')],
labels=['low', 'medium', 'high']
)
)
.groupby(['region', 'category'], observed=True)
.agg(
total_revenue=('revenue', 'sum'),
avg_revenue=('revenue', 'mean'),
order_count=('order_id', 'nunique'),
top_product=('product', lambda x: x.mode().iloc[0] if len(x.mode()) > 0 else None)
)
.reset_index()
.sort_values('total_revenue', ascending=False)
.head(20)
)
# Diagnose missing data
missing = (
df.isnull()
.sum()
.sort_values(ascending=False)
.head(20)
)
missing_pct = (missing / len(df) * 100).round(1)
print(f"Missing values:\n{missing_pct[missing_pct > 0]}")
# Strategic filling
df['price'] = df['price'].fillna(df.groupby('category')['price'].transform('median'))
df['name'] = df['name'].fillna('Unknown')
df['is_active'] = df['is_active'].fillna(False)
# Forward fill for time series
df['value'] = df['value'].ffill(limit=7) # Fill up to 7 consecutive gaps
# Interpolation for numeric data
df['temperature'] = df['temperature'].interpolate(method='linear', limit=5)
# Drop columns with too many missing values
threshold = 0.5 # Drop columns with >50% missing
df = df.loc[:, df.isnull().mean() < threshold]
# Multiple aggregations at once
agg_result = df.groupby('department').agg(
total_sales=('sales', 'sum'),
avg_sales=('sales', 'mean'),
num_employees=('employee_id', 'nunique'),
best_month=('sales', lambda x: x.idxmax()),
sales_std=('sales', 'std'),
).round(2)
# Named aggregations (cleaner output)
agg_result = df.groupby('region', observed=True).agg(
revenue_total=pd.NamedAgg(column='revenue', aggfunc='sum'),
revenue_avg=pd.NamedAgg(column='revenue', aggfunc='mean'),
orders=pd.NamedAgg(column='order_id', aggfunc='nunique'),
)
# Transform: add group-level stats back to each row
df['dept_avg'] = df.groupby('department')['salary'].transform('mean')
df['salary_rank'] = df.groupby('department')['salary'].rank(pct=True)
# Filter groups based on conditions
large_departments = (
df.groupby('department')
.filter(lambda x: len(x) >= 10)
)
# Different merge types for different use cases
# Inner merge: only matching rows
merged = pd.merge(orders, customers, on='customer_id', how='inner')
# Left merge: keep all from left
merged = pd.merge(orders, customers, on='customer_id', how='left')
# Handle duplicate keys with suffixes
merged = pd.merge(
df1, df2,
on='id',
how='outer',
suffixes=('_left', '_right'),
indicator=True # Adds '_merge' column showing merge origin
)
# Merge on multiple columns
merged = pd.merge(
sales, products,
left_on=['product_code', 'region'],
right_on=['code', 'region'],
how='left'
)
# Anti-join: find rows in A not in B
anti_join = df1[~df1['key'].isin(df2['key'])]
# Set datetime index
df['date'] = pd.to_datetime(df['date'])
df = df.set_index('date').sort_index()
# Resampling
daily = df.resample('D')['sales'].sum()
weekly = df.resample('W')['sales'].mean()
monthly = df.resample('ME')['revenue'].agg(['sum', 'mean', 'count'])
# Rolling windows
df['rolling_7d_avg'] = df['sales'].rolling(window=7).mean()
df['rolling_30d_std'] = df['sales'].rolling(window=30).std()
# Expanding windows
df['cumulative_avg'] = df['sales'].expanding().mean()
# Shift and difference
df['sales_change'] = df['sales'].diff()
df['sales_yoy'] = df['sales'].shift(365) # Year over year
df['pct_change'] = df['sales'].pct_change()
# Pivot table
pivot = df.pivot_table(
values='revenue',
index='region',
columns='product_category',
aggfunc='sum',
fill_value=0,
margins=True, # Add row/column totals
margins_name='Total'
)
# Crosstab for frequency analysis
ct = pd.crosstab(
df['department'],
df['satisfaction_level'],
values=df['salary'],
aggfunc='mean',
normalize='index' # Show percentages within each department
).round(3)
from tqdm import tqdm
tqdm.pandas()
def process_row(row):
"""Complex row-wise operation."""
# Expensive computation here
return result
df['result'] = df.progress_apply(process_row, axis=1)
# CSV with optimal settings
df.to_csv(
'output.csv',
index=False,
encoding='utf-8',
date_format='%Y-%m-%d',
float_format='%.2f'
)
# Parquet for large datasets (much faster and smaller)
df.to_parquet('output.parquet', engine='pyarrow', compression='snappy')
# Excel with formatting
with pd.ExcelWriter('report.xlsx', engine='openpyxl') as writer:
df_summary.to_excel(writer, sheet_name='Summary', index=False)
df_detail.to_excel(writer, sheet_name='Detail', index=False)
# Auto-adjust column widths
for sheet_name in writer.sheets:
worksheet = writer.sheets[sheet_name]
for column in worksheet.columns:
max_length = max(
len(str(cell.value)) for cell in column
)
worksheet.column_dimensions[
column[0].column_letter
].width = min(max_length + 2, 50)
category dtype for low-cardinality string columnsint32/float32 instead of 64-bit when precision allowsiterrows() — use vectorized operations or itertuples()eval() for complex expressions on large DataFramespolars or duckdb for datasets exceeding memory%timeit in Jupyter before optimizingThese tips should help you write Pandas code that’s not just correct, but fast and maintainable. The key is to always think in terms of vectorized operations and let Pandas do the heavy lifting at the C level.