# Standard data science imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Set pandas display options: show every column, at unrestricted line width
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
# Apply the seaborn-flavoured matplotlib style sheet
plt.style.use('seaborn-v0_8')
# CSV files
# NOTE: each call below overwrites df — they are alternative ways to load the same data
df = pd.read_csv('data.csv')
df = pd.read_csv('data.csv', index_col=0) # Use first column as index
df = pd.read_csv('data.csv', parse_dates=['date_column']) # Parse dates into datetime64
# Excel files (requires an Excel engine such as openpyxl to be installed)
df = pd.read_excel('data.xlsx', sheet_name='Sheet1')
# JSON files
df = pd.read_json('data.json')
# Basic info
print(df.shape) # (rows, columns) tuple
df.info() # Data types and null counts — info() prints itself and returns None, so wrapping it in print() emitted a stray "None"
print(df.describe()) # Statistical summary of numeric columns
print(df.head()) # First 5 rows
print(df.tail()) # Last 5 rows
# Null values
print(df.isnull().sum()) # Count of nulls per column
print(df.isnull().sum().sum()) # Total null count
# dropna/fillna return NEW DataFrames — assign the result to keep it
df_no_nulls = df.dropna() # Remove rows containing any null
df_filled = df.fillna(0) # Fill nulls with 0
# Select columns
# NOTE: the bare expressions below are shown for reference — they return new objects
# and do not modify df
df['column_name'] # Single column -> Series
df[['col1', 'col2']] # Multiple columns (list of names) -> DataFrame
# Filter rows
df[df['column'] > 5] # Conditional filtering
df[df['column'].isin(['A', 'B'])] # Filter by list of values
df[(df['col1'] > 5) & (df['col2'] < 10)] # Multiple conditions: parenthesize, use &/| (not and/or)
# Boolean indexing
mask = df['column'] > df['column'].mean()
df[mask]
# Remove duplicates
df.drop_duplicates() # Returns a new de-duplicated DataFrame (df itself is unchanged)
df.drop_duplicates(subset=['column1']) # De-duplicate based on a specific column
# Handle missing values
df.dropna(axis=0) # Drop rows with any null
df.dropna(axis=1) # Drop columns with any null
df.ffill() # Forward fill — fillna(method='ffill') is deprecated in pandas 2.x
df.fillna(df.mean(numeric_only=True)) # Fill with column mean; numeric_only avoids a TypeError on non-numeric columns
# Data type conversion
df['column'] = df['column'].astype('int64')
df['date'] = pd.to_datetime(df['date'])
# Group by operations
df.groupby('category').mean(numeric_only=True) # Mean of numeric columns by group (numeric_only avoids a TypeError in pandas >= 2.0)
df.groupby('category').size() # Row count per group
df.groupby('category').agg({'col1': 'sum', 'col2': 'mean'}) # Different aggregation per column
# Pivot tables
pd.pivot_table(df, values='value', index='row_var', columns='col_var', aggfunc='mean')
# Simple calculations
df['new_col'] = df['col1'] + df['col2'] # Element-wise (vectorised) addition
df['percentage'] = df['part'] / df['total'] * 100
# Conditional columns
df['category'] = df['score'].apply(lambda x: 'High' if x > 80 else 'Low') # Row-wise; slower than vectorised np.where
df['grade'] = np.where(df['score'] > 90, 'A',
                       np.where(df['score'] > 80, 'B', 'C')) # Nested np.where for a 3-way bucket
# Quick plots
df['column'].hist() # Histogram
df['column'].plot(kind='box') # Box plot
df.plot(x='col1', y='col2', kind='scatter') # Scatter plot
# Matplotlib basics
plt.figure(figsize=(10, 6)) # Width x height in inches
plt.plot(df['x'], df['y'])
plt.xlabel('X Label')
plt.ylabel('Y Label')
plt.title('Title')
plt.show() # Render the figure (in scripts, blocks until the window is closed)
# Seaborn basics
sns.histplot(data=df, x='column')
sns.boxplot(data=df, x='category', y='value')
sns.scatterplot(data=df, x='col1', y='col2', hue='category') # hue colors points by category
# Descriptive statistics
df['column'].mean() # Mean
df['column'].median() # Median
df['column'].std() # Sample standard deviation (ddof=1)
df['column'].quantile([0.25, 0.5, 0.75]) # Quartiles
# Correlation
df.corr(numeric_only=True) # Correlation matrix — numeric_only avoids a TypeError on non-numeric columns in pandas >= 2.0
df['col1'].corr(df['col2']) # Pearson correlation between two columns
# Save to CSV
df.to_csv('output.csv', index=False) # index=False skips writing the row index
# Save to Excel
df.to_excel('output.xlsx', index=False)
# Save specific columns
df[['col1', 'col2']].to_csv('subset.csv', index=False)
# Inspect dtypes (this statement was fused onto the line above, a syntax error)
print(df.dtypes)
print(type(df['column'].iloc[0])) # Concrete Python type of a single value
# Convert with explicit error handling: astype raises ValueError on unparsable values.
# (The original fused try: onto the print line and left the body unindented — a syntax error.)
try:
    result = df['column'].astype('int')
except ValueError as e:
    print(f"Error converting to int: {e}")
    # Handle the error appropriately
# Check for expected values
# NOTE: assert statements are stripped when Python runs with -O; raise an
# exception instead for production-grade validation
assert df['column'].min() >= 0, "Negative values found"
assert not df['column'].isnull().any(), "Null values found"
# Check data shape
expected_columns = ['col1', 'col2', 'col3']
assert all(col in df.columns for col in expected_columns), "Missing columns"
# 1. Import libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
# 2. Prepare data
X = df[['feature1', 'feature2']] # Feature matrix (2-D)
y = df['target'] # Target variable (1-D)
# 3. Split data — random_state fixes the shuffle for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# 4. Train model
model = LinearRegression()
model.fit(X_train, y_train)
# 5. Make predictions
y_pred = model.predict(X_test)
# 6. Evaluate on the held-out test set (never on the training data)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"MSE: {mse:.2f}, R²: {r2:.2f}")
# Create polynomial features
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2) # Adds squared and interaction terms
X_poly = poly.fit_transform(X)
# Scale features to zero mean / unit variance per column
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X) # NOTE(review): in practice fit the scaler on X_train only, then transform X_test, to avoid leakage
# Cross-validation
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, X, y, cv=5)
print(f"CV Score: {scores.mean():.2f} (+/- {scores.std() * 2:.2f})")
# Confusion matrix (classification)
from sklearn.metrics import confusion_matrix, classification_report
cm = confusion_matrix(y_test, y_pred)
print(classification_report(y_test, y_pred))- Always explore your data first: Use
.head(), .info(), and .describe()
- Handle missing values explicitly: Don't ignore them
- Use meaningful variable names: student_scores, not data
- Comment your code: Explain the why, not just the what
- Save intermediate results: Don't recalculate everything each time
- Use version control: Track your changes with git
- Document your assumptions: What does your analysis assume?
- Validate your results: Do they make sense?
- Data leakage: Don't use future information to predict the past
- Overfitting: Your model memorizes the training data and performs poorly on unseen data
- Underfitting: Your model is too simple
- Not splitting data: Always have separate training and test sets
- Ignoring data quality: Clean data is crucial
- Not checking assumptions: Understand your model's requirements
- Correlation vs. causation: Correlation doesn't imply causation
- Not validating results: Always double-check your findings
This cheat sheet covers the most common patterns you'll use in data science. Keep it handy as you work through real projects!