Appendix B: pandas Cheat Sheet
Python for Business for Beginners: Coding for Every Person
A quick reference for pandas operations. All examples use Python 3.10+ and pandas 2.0+. Import conventions assumed throughout:
import pandas as pd
import numpy as np
Reading Data
From CSV
df = pd.read_csv("sales.csv")
# With options
df = pd.read_csv(
"sales.csv",
encoding="utf-8", # or "latin-1" for older Windows files
sep=",", # separator (default comma)
header=0, # row number for column names (default 0)
index_col=None, # column to use as index
usecols=["date", "region", "revenue"], # only load specific columns
dtype={"customer_id": str}, # force specific column types
parse_dates=["date"], # parse as datetime automatically
nrows=1000, # only read first 1000 rows
skiprows=2, # skip first 2 rows
na_values=["N/A", "NULL", "-"], # additional NA strings
thousands=",", # thousands separator in numbers
decimal=".", # decimal separator
)
# From URL
df = pd.read_csv("https://example.com/data.csv")
From Excel
df = pd.read_excel("report.xlsx")
# With options
df = pd.read_excel(
"report.xlsx",
sheet_name="Q3 Sales", # specific sheet by name
sheet_name=0, # or by position (0 = first sheet)
header=0,
usecols="A:F", # Excel column range
skiprows=3, # skip header rows
nrows=500,
)
# Load all sheets as a dict
all_sheets = pd.read_excel("report.xlsx", sheet_name=None)
# all_sheets["Sheet1"], all_sheets["Sheet2"], etc.
From JSON
df = pd.read_json("data.json")
# From API response (list of records)
import requests
response = requests.get("https://api.example.com/records")
df = pd.DataFrame(response.json())
# From nested JSON (data is the parsed object, e.g. data = response.json())
df = pd.json_normalize(data, record_path=["orders"], meta=["customer_id"])
From SQL
import sqlite3
conn = sqlite3.connect("database.sqlite")
df = pd.read_sql("SELECT * FROM orders WHERE region = 'North'", conn)
df = pd.read_sql(
"SELECT o.*, c.name FROM orders o JOIN customers c ON o.customer_id = c.id",
conn,
parse_dates=["order_date"],
)
conn.close()
# With SQLAlchemy (for production databases)
from sqlalchemy import create_engine
engine = create_engine("postgresql://user:password@host/dbname")
df = pd.read_sql("SELECT * FROM sales", engine)
From Clipboard (quick data entry)
df = pd.read_clipboard() # paste Excel/table data directly from clipboard
DataFrame Inspection
df.shape # (rows, columns)
df.shape[0] # number of rows
df.shape[1] # number of columns
len(df) # number of rows (same as df.shape[0])
df.head() # first 5 rows
df.head(10) # first 10 rows
df.tail() # last 5 rows
df.info() # column names, dtypes, non-null counts, memory
df.describe() # summary statistics for numeric columns
df.describe(include="all") # include string columns too
df.dtypes # dtype of each column
df.columns # column names (as Index)
df.index # row labels (as Index)
df.columns.tolist() # column names as Python list
df.dtypes.to_dict() # column names to dtypes as dict
df.nunique() # count of unique values per column
df.value_counts("region") # frequency table for one column
df["region"].unique() # array of unique values
df["region"].value_counts() # frequency table for one column
df.memory_usage(deep=True) # memory used by each column
Selection
Columns
df["revenue"] # single column as Series
df[["revenue", "region"]] # multiple columns as DataFrame
df.revenue # attribute access (avoid — breaks with spaces)
# Select by dtype
df.select_dtypes(include="number") # all numeric columns
df.select_dtypes(include="object") # all string/object columns
df.select_dtypes(exclude="datetime64") # exclude datetime columns
Rows — .loc (label-based)
df.loc[0] # row with index label 0
df.loc[5:10] # rows with labels 5 through 10 (inclusive)
df.loc[[0, 5, 10]] # specific rows by label
df.loc[0, "revenue"] # single value: row 0, column "revenue"
df.loc[5:10, ["region", "revenue"]] # rows 5-10, specific columns
df.loc[5:10, "region":"cost"] # rows 5-10, column range
Rows — .iloc (position-based)
df.iloc[0] # first row (position 0)
df.iloc[-1] # last row
df.iloc[5:10] # rows at positions 5, 6, 7, 8, 9
df.iloc[[0, 5, 10]] # specific positions
df.iloc[0, 0] # first row, first column
df.iloc[:5, :3] # first 5 rows, first 3 columns
.at and .iat (single value access — faster)
df.at[0, "revenue"] # value at row label 0, column "revenue"
df.iat[0, 2] # value at row 0, column position 2
Filtering
Boolean Indexing
# Single condition
df[df["revenue"] > 10000]
df[df["region"] == "North"]
df[df["status"] != "cancelled"]
# Multiple conditions (use & for AND, | for OR, ~ for NOT)
df[(df["region"] == "North") & (df["revenue"] > 10000)]
df[(df["region"] == "North") | (df["region"] == "South")]
df[~(df["status"] == "cancelled")] # NOT
# Common gotcha: always use parentheses around each condition
# Wrong: df[df["a"] > 1 & df["b"] < 10] # operator precedence bug
# Right: df[(df["a"] > 1) & (df["b"] < 10)]
.query() method (readable for complex filters)
df.query("revenue > 10000")
df.query("region == 'North' and revenue > 10000")
df.query("region in ['North', 'South']")
df.query("revenue.between(5000, 15000)")
# Use @ to reference Python variables
min_revenue = 5000
df.query("revenue > @min_revenue")
.isin() and .between()
df[df["region"].isin(["North", "South", "East"])]
df[~df["region"].isin(["cancelled", "refunded"])] # NOT in list
df[df["revenue"].between(5000, 15000)] # inclusive by default
df[df["date"].between("2024-01-01", "2024-12-31")]
.str methods for filtering strings
df[df["name"].str.contains("Corp")] # contains substring
df[df["name"].str.startswith("Acme")]
df[df["email"].str.endswith("@example.com")]
df[df["code"].str.match(r"^[A-Z]{2}\d{4}$")] # regex match
df[df["name"].str.lower() == "acme corp"] # case-insensitive
Sorting
df.sort_values("revenue") # ascending by default
df.sort_values("revenue", ascending=False) # descending
df.sort_values(["region", "revenue"]) # multiple columns
df.sort_values(
["region", "revenue"],
ascending=[True, False], # asc by region, desc by revenue
)
df.sort_index() # sort by index
df.sort_index(ascending=False)
# nlargest / nsmallest (faster than sort + head for large data)
df.nlargest(10, "revenue") # top 10 rows by revenue
df.nsmallest(5, "cost") # bottom 5 rows by cost
Missing Data
# Detection
df.isna() # boolean DataFrame: True where NaN
df.notna() # True where NOT NaN
df.isna().sum() # count of NaN per column
df.isna().sum().sum() # total NaN count
df["revenue"].isna().any() # True if any NaN in column
df["revenue"].isna().all() # True if all NaN in column
# Filling
df.fillna(0) # fill all NaN with 0
df["revenue"].fillna(0) # fill NaN in one column
df.fillna({"revenue": 0, "region": "Unknown"}) # per-column fill
df.fillna(method="ffill") # forward fill (deprecated — see below)
df.ffill() # forward fill (pandas 2.0+)
df.bfill() # backward fill (pandas 2.0+)
df["revenue"].fillna(df["revenue"].mean()) # fill with column mean
# Dropping
df.dropna() # drop any row with any NaN
df.dropna(how="all") # only drop rows where ALL values are NaN
df.dropna(subset=["customer_id"]) # drop only if key column is NaN
df.dropna(axis=1) # drop columns with any NaN
df.dropna(thresh=5) # keep rows with at least 5 non-NaN values
Adding and Modifying Columns
# Add a new calculated column
df["margin"] = (df["revenue"] - df["cost"]) / df["revenue"]
# Conditional column
df["tier"] = "standard"
df.loc[df["revenue"] > 50000, "tier"] = "premium"
df.loc[df["revenue"] > 200000, "tier"] = "enterprise"
# Using np.where (vectorized if/else)
df["band"] = np.where(df["revenue"] > 10000, "high", "low")
# (for a plain boolean column, skip np.where: df["is_profitable"] = df["margin"] > 0)
# Using np.select (multiple conditions)
conditions = [
df["revenue"] > 100000,
df["revenue"] > 50000,
df["revenue"] > 10000,
]
choices = ["Large", "Medium", "Small"]
df["size"] = np.select(conditions, choices, default="Micro")
# String column operations
df["name_clean"] = df["name"].str.strip().str.title()
df["email_domain"] = df["email"].str.split("@").str[1]
df["code_upper"] = df["code"].str.upper()
# Rename columns
df.rename(columns={"rev": "revenue", "cust": "customer"}, inplace=True)
# Drop columns
df.drop(columns=["temp_col", "debug_col"])
df.drop(columns=["temp_col"], inplace=True) # modify in place
GroupBy Operations
# Single-column groupby
df.groupby("region")["revenue"].sum()
df.groupby("region")["revenue"].mean()
df.groupby("region")["revenue"].agg(["sum", "mean", "count", "max"])
# Multi-column groupby
df.groupby(["region", "product"])["revenue"].sum()
# Named aggregations (pandas 1.0+)
result = df.groupby("region").agg(
total_revenue=("revenue", "sum"),
avg_order=("revenue", "mean"),
order_count=("revenue", "count"),
max_order=("revenue", "max"),
)
# Multiple aggregations for multiple columns
result = df.groupby("region").agg(
{
"revenue": ["sum", "mean"],
"cost": "sum",
"customer_id": "nunique",
}
)
# Custom aggregation function
result = df.groupby("region")["revenue"].agg(
lambda x: x.quantile(0.75) # 75th percentile by region
)
# transform: return result with same shape as original
df["revenue_pct_of_region"] = (
df["revenue"] /
df.groupby("region")["revenue"].transform("sum")
)
# reset_index: make group keys into regular columns
df.groupby("region")["revenue"].sum().reset_index()
# size: count rows per group
df.groupby("region").size()
# filter: keep groups matching a condition
high_volume = df.groupby("region").filter(
lambda g: g["revenue"].sum() > 100000
)
Merge and Join
# merge (like SQL JOIN)
merged = pd.merge(orders, customers, on="customer_id")
# Different column names
merged = pd.merge(
orders,
customers,
left_on="cust_id",
right_on="customer_id",
)
# Join types
inner = pd.merge(a, b, on="id", how="inner") # only matching rows
left = pd.merge(a, b, on="id", how="left") # all rows from a
right = pd.merge(a, b, on="id", how="right") # all rows from b
outer = pd.merge(a, b, on="id", how="outer") # all rows from both
# Multi-key merge
merged = pd.merge(a, b, on=["region", "product"])
# Check for duplicates after merge (expected_rows = the row count you anticipate)
assert len(merged) == expected_rows, "Unexpected row count after merge"
# concat: stack DataFrames vertically
combined = pd.concat([q1, q2, q3, q4])
combined = pd.concat([q1, q2, q3, q4], ignore_index=True) # reset index
# concat horizontally (side by side)
wide = pd.concat([df_a, df_b], axis=1)
# join: merge on index
result = df1.join(df2, how="left")
Pivot Tables
# Basic pivot table
pivot = df.pivot_table(
values="revenue",
index="region",
columns="product",
aggfunc="sum",
)
# Multiple aggregations
pivot = df.pivot_table(
values="revenue",
index=["region", "year"],
columns="quarter",
aggfunc=["sum", "mean"],
fill_value=0, # replace NaN with 0
margins=True, # add row/column totals
)
# pivot (without aggregation — requires unique index/column combinations)
pivot = df.pivot(index="date", columns="region", values="revenue")
# melt: unpivot (wide to long format)
melted = pd.melt(
pivot_df,
id_vars=["date"],
value_vars=["North", "South", "East"],
var_name="region",
value_name="revenue",
)
Apply Functions
# apply on a Series
df["revenue_thousands"] = df["revenue"].apply(lambda x: x / 1000)
# apply with a named function (preferred for complex logic)
def categorize_revenue(amount: float) -> str:
if amount > 100000:
return "Large"
elif amount > 10000:
return "Medium"
return "Small"
df["category"] = df["revenue"].apply(categorize_revenue)
# apply on a DataFrame row (axis=1)
def calculate_margin(row: pd.Series) -> float:
if row["revenue"] == 0:
return 0.0
return (row["revenue"] - row["cost"]) / row["revenue"]
df["margin"] = df.apply(calculate_margin, axis=1)
# map: element-wise transformation on a Series
size_map = {"S": "Small", "M": "Medium", "L": "Large"}
df["size_label"] = df["size_code"].map(size_map)
# Vectorized operations (faster than apply — prefer when possible)
df["margin"] = (df["revenue"] - df["cost"]) / df["revenue"]
df["revenue_log"] = np.log(df["revenue"].clip(lower=1))
DateTime Operations
# Parse string columns as datetime
df["date"] = pd.to_datetime(df["date"])
df["date"] = pd.to_datetime(df["date"], format="%Y-%m-%d")
df["date"] = pd.to_datetime(df["date"], errors="coerce") # NaT for invalid
# dt accessor — extract components
df["year"] = df["date"].dt.year
df["month"] = df["date"].dt.month
df["day"] = df["date"].dt.day
df["quarter"] = df["date"].dt.quarter
df["day_of_week"] = df["date"].dt.dayofweek # 0=Monday, 6=Sunday
df["week"] = df["date"].dt.isocalendar().week
# Format as string
df["date_str"] = df["date"].dt.strftime("%B %d, %Y") # "January 15, 2025"
df["month_year"] = df["date"].dt.to_period("M") # "2025-01"
# Time-based filtering
df[df["date"] >= "2024-01-01"]
df[df["date"].dt.year == 2024]
df[df["date"].dt.month.isin([10, 11, 12])] # Q4
# Resample: aggregate over time periods
df.set_index("date")["revenue"].resample("ME").sum() # monthly
df.set_index("date")["revenue"].resample("QE").mean() # quarterly
df.set_index("date")["revenue"].resample("YE").sum() # annual
# (Note: "ME" = month end, "QE" = quarter end, "YE" = year end in pandas 2.2+)
# Date arithmetic
df["days_since_first"] = (df["date"] - df["date"].min()).dt.days
df["next_renewal"] = df["date"] + pd.DateOffset(months=12)
Saving Data
To CSV
df.to_csv("output.csv", index=False) # don't write index column
df.to_csv("output.csv", index=False, encoding="utf-8")
df.to_csv("output.csv.gz", compression="gzip") # compressed
To Excel
df.to_excel("output.xlsx", index=False, sheet_name="Sales")
# Multiple sheets
with pd.ExcelWriter("report.xlsx", engine="openpyxl") as writer:
sales_df.to_excel(writer, sheet_name="Sales", index=False)
cost_df.to_excel(writer, sheet_name="Costs", index=False)
summary_df.to_excel(writer, sheet_name="Summary", index=False)
# With formatting
from openpyxl.styles import Font, PatternFill
with pd.ExcelWriter("styled.xlsx", engine="openpyxl") as writer:
df.to_excel(writer, sheet_name="Data", index=False)
ws = writer.sheets["Data"]
for cell in ws[1]:
cell.font = Font(bold=True)
To JSON
df.to_json("output.json", orient="records", indent=2)
# orient options: "records" (list of dicts), "split", "index", "columns"
To SQL
import sqlite3
conn = sqlite3.connect("database.sqlite")
df.to_sql("sales", conn, if_exists="replace", index=False)
df.to_sql("sales", conn, if_exists="append", index=False)
conn.close()
To Clipboard
df.to_clipboard(index=False) # paste into Excel/Sheets
Performance Tips
# Use categorical dtype for low-cardinality string columns (saves memory)
df["region"] = df["region"].astype("category")
df["status"] = df["status"].astype("category")
# Check memory usage
df.memory_usage(deep=True)
# Read large files in chunks
chunks = []
for chunk in pd.read_csv("large_file.csv", chunksize=10_000):
chunks.append(process(chunk))
result = pd.concat(chunks, ignore_index=True)
# Avoid iterrows() — it is very slow
# Instead, use vectorized operations or apply()
# Use query() for readable filtering (often slightly faster too)
df.query("revenue > 10000 and region == 'North'")
# inplace=True modifies the DataFrame without returning a copy
# Use it deliberately — it can be confusing
df.sort_values("date", inplace=True)
Common Gotchas
# 1. SettingWithCopyWarning
# Don't modify a slice of a DataFrame
bad = df[df["region"] == "North"]
bad["new_col"] = 1 # might not work — may be a copy
# Instead, use .copy()
north = df[df["region"] == "North"].copy()
north["new_col"] = 1 # safe
# Or use loc with a boolean mask
df.loc[df["region"] == "North", "new_col"] = 1 # safest
# 2. Chained assignment
df["col"][0] = 1 # unreliable
df.loc[0, "col"] = 1 # correct
# 3. & vs and in boolean indexing
df[df["a"] > 1 and df["b"] < 10] # TypeError
df[(df["a"] > 1) & (df["b"] < 10)] # correct
# 4. reset_index after groupby
result = df.groupby("region")["revenue"].sum() # region is the index
result.reset_index() # make region a regular column again
# 5. axis confusion
df.drop(columns=["col"]) # axis=1 (columns)
df.drop([0, 1, 2]) # axis=0 (rows, by index label)
df.apply(func, axis=1) # axis=1 = apply to each row
df.apply(func, axis=0) # axis=0 = apply to each column
# 6. Integer vs float NaN
# With default NumPy-backed dtypes, NaN forces an integer column to float
# Use pd.Int64Dtype() (the nullable "Int64" dtype) to keep integers with missing values
df["count"] = df["count"].astype(pd.Int64Dtype())
For the complete pandas API reference, see pandas.pydata.org/docs. For Python language reference, see Appendix A.