Interview Preparation

Practice Problems & Q&A for Senior Quantitative Analytics Associate

9. Sample SQL Interview Questions

Question 1: Customer Transaction Analysis

Problem: Find customers who made transactions in 3 consecutive months.

WITH monthly_transactions AS (
    SELECT
        customer_id,
        DATE_TRUNC('month', transaction_date) AS month,
        COUNT(*) AS transaction_count
    FROM transactions
    GROUP BY customer_id, DATE_TRUNC('month', transaction_date)
),
consecutive_months AS (
    SELECT
        customer_id,
        month,
        LAG(month, 1) OVER (PARTITION BY customer_id ORDER BY month) AS prev_month_1,
        LAG(month, 2) OVER (PARTITION BY customer_id ORDER BY month) AS prev_month_2
    FROM monthly_transactions
)
SELECT DISTINCT customer_id
FROM consecutive_months
WHERE month = prev_month_1 + INTERVAL '1 month'
  AND prev_month_1 = prev_month_2 + INTERVAL '1 month';

Question 2: Agent Performance Metrics

Problem: Calculate average handle time, first call resolution rate, and customer satisfaction by agent.

SELECT
    a.agent_id,
    a.agent_name,
    COUNT(c.call_id) AS total_calls,
    AVG(c.handle_time_seconds) AS avg_handle_time,
    SUM(CASE WHEN c.first_call_resolution = 'Y' THEN 1 ELSE 0 END) * 100.0 / COUNT(*) AS fcr_rate,
    AVG(c.customer_satisfaction_score) AS avg_csat,
    PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY c.handle_time_seconds) AS median_handle_time
FROM calls c
JOIN agents a ON c.agent_id = a.agent_id
WHERE c.call_date >= CURRENT_DATE - INTERVAL '30 days'
GROUP BY a.agent_id, a.agent_name          -- qualify agent_id to avoid ambiguity across the join
HAVING COUNT(c.call_id) >= 50
ORDER BY fcr_rate DESC;

Question 3: Cohort Retention Analysis

Problem: Calculate monthly retention rates for customer cohorts.

WITH cohorts AS (
    SELECT
        customer_id,
        DATE_TRUNC('month', MIN(signup_date)) AS cohort_month
    FROM customers
    GROUP BY customer_id
),
customer_activity AS (
    SELECT DISTINCT
        customer_id,
        DATE_TRUNC('month', activity_date) AS activity_month
    FROM transactions
)
SELECT
    c.cohort_month,
    a.activity_month,
    -- include full years so the month counter does not reset after 12 months
    EXTRACT(YEAR FROM AGE(a.activity_month, c.cohort_month)) * 12
        + EXTRACT(MONTH FROM AGE(a.activity_month, c.cohort_month)) AS months_since_signup,
    COUNT(DISTINCT a.customer_id) AS active_customers,
    COUNT(DISTINCT a.customer_id) * 100.0 /
        (SELECT COUNT(*) FROM cohorts WHERE cohort_month = c.cohort_month) AS retention_rate
FROM cohorts c
JOIN customer_activity a ON c.customer_id = a.customer_id
GROUP BY c.cohort_month, a.activity_month
ORDER BY c.cohort_month, months_since_signup;

13. Advanced SQL Interview Questions

Question 4: Running Totals and Moving Averages

Problem: Calculate running total of sales and 7-day moving average by product.

SELECT
    product_id,
    sale_date,
    daily_sales,
    SUM(daily_sales) OVER (
        PARTITION BY product_id
        ORDER BY sale_date
        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
    ) AS running_total,
    AVG(daily_sales) OVER (
        PARTITION BY product_id
        ORDER BY sale_date
        ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
    ) AS moving_avg_7day,
    daily_sales - LAG(daily_sales, 7) OVER (
        PARTITION BY product_id
        ORDER BY sale_date
    ) AS week_over_week_change
FROM daily_product_sales
ORDER BY product_id, sale_date;

Question 5: Self-Join for Finding Gaps

Problem: Find gaps in customer account numbers (missing IDs).

WITH account_numbers AS (
    SELECT DISTINCT account_id
    FROM accounts
),
gaps AS (
    SELECT
        a1.account_id + 1 AS gap_start,
        MIN(a2.account_id) - 1 AS gap_end
    FROM account_numbers a1
    LEFT JOIN account_numbers a2 ON a2.account_id > a1.account_id
    WHERE NOT EXISTS (
        SELECT 1
        FROM account_numbers a3
        WHERE a3.account_id = a1.account_id + 1
    )
    GROUP BY a1.account_id
    HAVING MIN(a2.account_id) IS NOT NULL
)
SELECT
    gap_start,
    gap_end,
    gap_end - gap_start + 1 AS gap_size
FROM gaps
WHERE gap_start <= gap_end;

Question 6: Hierarchical Queries (Manager-Employee)

Problem: Find all employees in a manager's reporting chain with their level.

WITH RECURSIVE employee_hierarchy AS (
    -- Anchor: start with the top-level manager
    SELECT
        employee_id,
        employee_name,
        manager_id,
        1 AS level,
        CAST(employee_name AS VARCHAR(1000)) AS path
    FROM employees
    WHERE manager_id IS NULL

    UNION ALL

    -- Recursive step: get all reports
    SELECT
        e.employee_id,
        e.employee_name,
        e.manager_id,
        eh.level + 1,
        CAST(eh.path || ' -> ' || e.employee_name AS VARCHAR(1000))
    FROM employees e
    INNER JOIN employee_hierarchy eh ON e.manager_id = eh.employee_id
)
SELECT
    employee_id,
    employee_name,
    level,
    path,
    REPEAT(' ', level - 1) || employee_name AS indented_name
FROM employee_hierarchy
ORDER BY path;

Question 7: Complex Aggregation - Pivoting Data

Problem: Pivot transaction data to show monthly sales by product category.

SELECT
    product_category,
    SUM(CASE WHEN EXTRACT(MONTH FROM transaction_date) = 1 THEN amount ELSE 0 END) AS jan_sales,
    SUM(CASE WHEN EXTRACT(MONTH FROM transaction_date) = 2 THEN amount ELSE 0 END) AS feb_sales,
    SUM(CASE WHEN EXTRACT(MONTH FROM transaction_date) = 3 THEN amount ELSE 0 END) AS mar_sales,
    SUM(CASE WHEN EXTRACT(MONTH FROM transaction_date) = 4 THEN amount ELSE 0 END) AS apr_sales,
    SUM(CASE WHEN EXTRACT(MONTH FROM transaction_date) = 5 THEN amount ELSE 0 END) AS may_sales,
    SUM(CASE WHEN EXTRACT(MONTH FROM transaction_date) = 6 THEN amount ELSE 0 END) AS jun_sales,
    SUM(amount) AS total_sales,
    COUNT(DISTINCT customer_id) AS unique_customers
FROM transactions
WHERE EXTRACT(YEAR FROM transaction_date) = 2024
GROUP BY product_category
ORDER BY total_sales DESC;

Question 8: Performance Optimization

Problem: Optimize a slow query that joins multiple large tables.

Before (Slow):

-- Slow version
SELECT *
FROM customers c
JOIN orders o ON c.customer_id = o.customer_id
JOIN order_items oi ON o.order_id = oi.order_id
WHERE o.order_date >= '2024-01-01';

After (Optimized):

-- Optimized version
WITH recent_orders AS (
    SELECT order_id, customer_id, order_date
    FROM orders
    WHERE order_date >= '2024-01-01'   -- Filter early
      AND status != 'cancelled'
),
order_summary AS (
    SELECT
        ro.customer_id,
        COUNT(DISTINCT ro.order_id) AS order_count,
        SUM(oi.quantity * oi.unit_price) AS total_value
    FROM recent_orders ro
    JOIN order_items oi ON ro.order_id = oi.order_id
    GROUP BY ro.customer_id
)
SELECT
    c.customer_id,
    c.customer_name,
    c.email,
    COALESCE(os.order_count, 0) AS order_count,
    COALESCE(os.total_value, 0) AS total_value
FROM customers c
LEFT JOIN order_summary os ON c.customer_id = os.customer_id
WHERE c.status = 'active';

-- Key optimizations:
-- 1. Filter early (WHERE on an indexed column)
-- 2. Use CTEs for readability
-- 3. Aggregate before joining to reduce rows
-- 4. Select only the needed columns (not SELECT *)
-- 5. Ensure indexes on join columns and WHERE conditions

Question 9: Detecting Fraud Patterns

Problem: Identify suspicious transaction patterns (multiple transactions in short time from different locations).

WITH transaction_pairs AS (
    SELECT
        t1.customer_id,
        t1.transaction_id AS trans_1,
        t2.transaction_id AS trans_2,
        t1.transaction_time AS time_1,
        t2.transaction_time AS time_2,
        t1.location AS location_1,
        t2.location AS location_2,
        EXTRACT(EPOCH FROM (t2.transaction_time - t1.transaction_time)) / 60 AS minutes_apart,
        t1.amount + t2.amount AS combined_amount
    FROM transactions t1
    JOIN transactions t2
        ON t1.customer_id = t2.customer_id
        AND t1.transaction_id < t2.transaction_id
        AND t2.transaction_time BETWEEN t1.transaction_time
                                    AND t1.transaction_time + INTERVAL '30 minutes'
    WHERE t1.location != t2.location
)
SELECT
    customer_id,
    COUNT(*) AS suspicious_pairs,
    SUM(combined_amount) AS total_suspicious_amount,
    MIN(minutes_apart) AS fastest_transaction_minutes,
    ARRAY_AGG(DISTINCT location_1 || ' -> ' || location_2) AS location_pairs
FROM transaction_pairs
WHERE minutes_apart <= 15  -- Transactions less than 15 minutes apart
GROUP BY customer_id
HAVING COUNT(*) >= 2
ORDER BY suspicious_pairs DESC, total_suspicious_amount DESC;

10. Sample Python Interview Questions

Question 1: Data Cleaning Pipeline

Problem: Build a function to clean and prepare customer data for analysis.

import pandas as pd
import numpy as np


def clean_customer_data(df):
    """
    Clean customer data by handling missing values, outliers,
    and standardizing formats.
    """
    # Create a copy to avoid modifying the original
    df_clean = df.copy()

    # Handle missing values (assignment form avoids chained-assignment issues)
    df_clean['email'] = df_clean['email'].fillna('unknown@email.com')
    df_clean['age'] = df_clean['age'].fillna(df_clean['age'].median())

    # Remove outliers using the IQR method
    Q1 = df_clean['transaction_amount'].quantile(0.25)
    Q3 = df_clean['transaction_amount'].quantile(0.75)
    IQR = Q3 - Q1
    df_clean = df_clean[
        (df_clean['transaction_amount'] >= Q1 - 1.5 * IQR) &
        (df_clean['transaction_amount'] <= Q3 + 1.5 * IQR)
    ]

    # Standardize date formats
    df_clean['signup_date'] = pd.to_datetime(df_clean['signup_date'])

    # Create derived features
    df_clean['tenure_days'] = (pd.Timestamp.now() - df_clean['signup_date']).dt.days
    df_clean['age_group'] = pd.cut(df_clean['age'],
                                   bins=[0, 25, 40, 60, 100],
                                   labels=['18-25', '26-40', '41-60', '60+'])

    return df_clean

Question 2: A/B Test Analysis

Problem: Analyze results of an A/B test and determine statistical significance.

import numpy as np
import pandas as pd
from statsmodels.stats.proportion import proportions_ztest


def analyze_ab_test(control_group, treatment_group):
    """
    Perform statistical analysis on A/B test results
    (expects DataFrames with a binary 'converted' column).
    """
    # Calculate conversion rates
    control_conversions = control_group['converted'].sum()
    control_total = len(control_group)
    control_rate = control_conversions / control_total

    treatment_conversions = treatment_group['converted'].sum()
    treatment_total = len(treatment_group)
    treatment_rate = treatment_conversions / treatment_total

    # Two-proportion z-test (proportions_ztest lives in statsmodels, not scipy)
    successes = np.array([control_conversions, treatment_conversions])
    samples = np.array([control_total, treatment_total])
    stat, p_value = proportions_ztest(successes, samples)

    # 95% confidence interval for the difference in rates
    pooled_prob = (control_conversions + treatment_conversions) / (control_total + treatment_total)
    se = np.sqrt(pooled_prob * (1 - pooled_prob) * (1 / control_total + 1 / treatment_total))
    ci_lower = (treatment_rate - control_rate) - 1.96 * se
    ci_upper = (treatment_rate - control_rate) + 1.96 * se

    results = {
        'control_rate': control_rate,
        'treatment_rate': treatment_rate,
        'lift': (treatment_rate - control_rate) / control_rate * 100,
        'p_value': p_value,
        'significant': p_value < 0.05,
        'ci_95': (ci_lower, ci_upper)
    }
    return results

Question 3: Feature Engineering for ML

Problem: Create features for a churn prediction model.

import pandas as pd


def engineer_churn_features(transactions_df, customers_df):
    """
    Create features for customer churn prediction.
    """
    # Aggregate transaction features
    customer_features = transactions_df.groupby('customer_id').agg({
        'transaction_amount': ['sum', 'mean', 'std', 'count'],
        'transaction_date': ['min', 'max']
    }).reset_index()

    # Flatten column names
    customer_features.columns = ['_'.join(col).strip('_')
                                 for col in customer_features.columns.values]

    # Calculate recency, frequency, monetary (RFM)
    reference_date = pd.Timestamp.now()
    customer_features['recency_days'] = (
        reference_date - pd.to_datetime(customer_features['transaction_date_max'])
    ).dt.days
    customer_features['tenure_days'] = (
        reference_date - pd.to_datetime(customer_features['transaction_date_min'])
    ).dt.days
    customer_features['frequency'] = customer_features['transaction_amount_count']
    customer_features['monetary'] = customer_features['transaction_amount_sum']

    # Calculate transaction trends over the last 30 days
    recent_30d = transactions_df[
        transactions_df['transaction_date'] >= reference_date - pd.Timedelta(days=30)
    ]
    recent_activity = recent_30d.groupby('customer_id').agg({
        'transaction_amount': 'sum'
    }).reset_index()
    recent_activity.columns = ['customer_id', 'amount_last_30d']

    customer_features = customer_features.merge(
        recent_activity, on='customer_id', how='left'
    )
    customer_features['amount_last_30d'] = customer_features['amount_last_30d'].fillna(0)

    # Join with customer demographics
    final_features = customer_features.merge(customers_df, on='customer_id')

    return final_features

14. Advanced Python Interview Questions

Question 4: Time Series Decomposition

Problem: Decompose time series into trend, seasonality, and residuals.

import pandas as pd
import numpy as np
from statsmodels.tsa.seasonal import seasonal_decompose
import matplotlib.pyplot as plt


def analyze_time_series(df, date_col, value_col, period=7):
    """
    Decompose a time series and detect anomalies.
    """
    # Ensure a datetime index
    df = df.sort_values(date_col)
    df.set_index(date_col, inplace=True)

    # Decomposition
    decomposition = seasonal_decompose(
        df[value_col],
        model='additive',  # or 'multiplicative'
        period=period
    )

    # Extract components
    trend = decomposition.trend
    seasonal = decomposition.seasonal
    residual = decomposition.resid

    # Detect anomalies using residuals
    residual_std = residual.std()
    threshold = 3 * residual_std
    anomalies = df[np.abs(residual) > threshold].copy()
    anomalies['residual'] = residual

    # Calculate metrics (drop the NaN edges of the moving-average trend first)
    trend_clean = trend.dropna()
    metrics = {
        'trend_direction': 'increasing' if trend_clean.iloc[-1] > trend_clean.iloc[0] else 'decreasing',
        'avg_seasonal_variation': seasonal.std(),
        'anomaly_count': len(anomalies),
        'coefficient_of_variation': df[value_col].std() / df[value_col].mean()
    }

    return {
        'trend': trend,
        'seasonal': seasonal,
        'residual': residual,
        'anomalies': anomalies,
        'metrics': metrics
    }

Question 5: Customer Segmentation with K-Means

Problem: Segment customers using RFM analysis and K-means clustering.

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np


def customer_segmentation(transactions_df):
    """
    Perform RFM analysis and K-means clustering.
    """
    reference_date = transactions_df['transaction_date'].max()

    # Calculate RFM
    rfm = transactions_df.groupby('customer_id').agg({
        'transaction_date': lambda x: (reference_date - x.max()).days,  # Recency
        'transaction_id': 'count',                                      # Frequency
        'amount': 'sum'                                                 # Monetary
    })
    rfm.columns = ['recency', 'frequency', 'monetary']

    # Add additional features
    rfm['avg_transaction_value'] = rfm['monetary'] / rfm['frequency']

    # Handle outliers with a log transformation
    rfm['log_monetary'] = np.log1p(rfm['monetary'])
    rfm['log_frequency'] = np.log1p(rfm['frequency'])

    # Standardize features
    features = ['recency', 'log_frequency', 'log_monetary']
    scaler = StandardScaler()
    rfm_scaled = scaler.fit_transform(rfm[features])

    # Find a candidate number of clusters using the elbow method
    inertias = []
    K_range = range(2, 11)
    for k in K_range:
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        kmeans.fit(rfm_scaled)
        inertias.append(kmeans.inertia_)

    # Apply K-means with the chosen k (say 4, based on the elbow plot)
    optimal_k = 4
    kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
    rfm['cluster'] = kmeans.fit_predict(rfm_scaled)

    # Profile each cluster (customer_id is the index, so take the count from 'monetary')
    cluster_profiles = rfm.groupby('cluster').agg({
        'recency': ['mean', 'median'],
        'frequency': ['mean', 'median'],
        'monetary': ['mean', 'median', 'count']
    }).round(2)

    # Label clusters after inspecting the profiles (mapping is illustrative)
    cluster_labels = {
        0: 'Champions',
        1: 'Loyal Customers',
        2: 'At Risk',
        3: 'Lost'
    }
    rfm['segment'] = rfm['cluster'].map(cluster_labels)

    return rfm, cluster_profiles

Question 6: ETL Pipeline with Error Handling

Problem: Build a robust ETL pipeline with logging and error handling.

import pandas as pd
import logging
from datetime import datetime
from typing import Dict
import sys

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(f'etl_pipeline_{datetime.now().strftime("%Y%m%d")}.log'),
        logging.StreamHandler(sys.stdout)
    ]
)
logger = logging.getLogger(__name__)


class DataPipeline:
    def __init__(self, config: Dict):
        self.config = config
        self.metrics = {
            'rows_extracted': 0,
            'rows_transformed': 0,
            'rows_loaded': 0,
            'errors': []
        }

    def extract(self, source_path: str) -> pd.DataFrame:
        """Extract data from the source."""
        try:
            logger.info(f"Extracting data from {source_path}")
            df = pd.read_csv(source_path)
            self.metrics['rows_extracted'] = len(df)
            logger.info(f"Successfully extracted {len(df)} rows")
            return df
        except Exception as e:
            logger.error(f"Extraction failed: {str(e)}")
            self.metrics['errors'].append(f"Extraction: {str(e)}")
            raise

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Transform and clean the data."""
        try:
            logger.info("Starting data transformation")
            initial_rows = len(df)

            # Validation
            required_columns = self.config.get('required_columns', [])
            missing_cols = set(required_columns) - set(df.columns)
            if missing_cols:
                raise ValueError(f"Missing required columns: {missing_cols}")

            # Data quality checks
            duplicates = df.duplicated().sum()
            if duplicates > 0:
                logger.warning(f"Removing {duplicates} duplicate rows")
                df = df.drop_duplicates()

            # Handle missing values
            null_counts = df.isnull().sum()
            for col, null_count in null_counts[null_counts > 0].items():
                logger.info(f"Column '{col}' has {null_count} null values")
                if col in self.config.get('numeric_columns', []):
                    df[col] = df[col].fillna(df[col].median())
                elif col in self.config.get('categorical_columns', []):
                    df[col] = df[col].fillna('Unknown')

            # Type conversions
            for col, dtype in self.config.get('column_types', {}).items():
                if col in df.columns:
                    df[col] = df[col].astype(dtype)

            # Business logic transformations
            if 'transaction_date' in df.columns:
                df['transaction_date'] = pd.to_datetime(df['transaction_date'])
                df['year'] = df['transaction_date'].dt.year
                df['month'] = df['transaction_date'].dt.month
                df['day_of_week'] = df['transaction_date'].dt.dayofweek

            # Outlier handling
            for col in self.config.get('outlier_columns', []):
                Q1 = df[col].quantile(0.25)
                Q3 = df[col].quantile(0.75)
                IQR = Q3 - Q1
                lower_bound = Q1 - 1.5 * IQR
                upper_bound = Q3 + 1.5 * IQR
                outliers = ((df[col] < lower_bound) | (df[col] > upper_bound)).sum()
                if outliers > 0:
                    logger.warning(f"Found {outliers} outliers in '{col}'")
                    df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]

            self.metrics['rows_transformed'] = len(df)
            rows_removed = initial_rows - len(df)
            logger.info(f"Transformation complete. Removed {rows_removed} rows")
            return df
        except Exception as e:
            logger.error(f"Transformation failed: {str(e)}")
            self.metrics['errors'].append(f"Transformation: {str(e)}")
            raise

    def load(self, df: pd.DataFrame, destination_path: str):
        """Load data to the destination."""
        try:
            logger.info(f"Loading data to {destination_path}")
            df.to_csv(destination_path, index=False)
            self.metrics['rows_loaded'] = len(df)
            logger.info(f"Successfully loaded {len(df)} rows")
        except Exception as e:
            logger.error(f"Load failed: {str(e)}")
            self.metrics['errors'].append(f"Load: {str(e)}")
            raise

    def run(self, source_path: str, destination_path: str) -> Dict:
        """Execute the full ETL pipeline."""
        start_time = datetime.now()
        logger.info("=" * 50)
        logger.info("Starting ETL Pipeline")
        logger.info("=" * 50)

        try:
            # ETL steps
            df = self.extract(source_path)
            df_transformed = self.transform(df)
            self.load(df_transformed, destination_path)

            # Calculate metrics
            duration = (datetime.now() - start_time).total_seconds()
            self.metrics['duration_seconds'] = duration
            self.metrics['success'] = True

            logger.info("=" * 50)
            logger.info("ETL Pipeline Completed Successfully")
            logger.info(f"Duration: {duration:.2f} seconds")
            logger.info(f"Rows: {self.metrics['rows_extracted']} → {self.metrics['rows_loaded']}")
            logger.info("=" * 50)
        except Exception as e:
            self.metrics['success'] = False
            logger.error("ETL Pipeline Failed")
            logger.error(str(e))

        return self.metrics


# Usage
config = {
    'required_columns': ['customer_id', 'transaction_date', 'amount'],
    'numeric_columns': ['amount'],
    'categorical_columns': ['product_category'],
    'column_types': {'customer_id': str},
    'outlier_columns': ['amount']
}

pipeline = DataPipeline(config)
results = pipeline.run('source_data.csv', 'processed_data.csv')

15. Probability & Statistics Problems

Problem 1: Expected Value

Question: A credit card company offers cashback: 3% on gas, 2% on groceries, 1% on everything else. Average customer spends $200/month on gas, $400 on groceries, $600 on other. What's the expected monthly cashback?

Solution:

Expected Cashback = (0.03 × $200) + (0.02 × $400) + (0.01 × $600)
                  = $6 + $8 + $6
                  = $20/month

Annual expected cashback = $20 × 12 = $240

Problem 2: Bayes' Theorem (Fraud Detection)

Question: 0.1% of transactions are fraudulent. Your fraud detection model has 95% sensitivity (true positive rate) and 98% specificity (true negative rate). If a transaction is flagged as fraud, what's the probability it's actually fraudulent?

Solution:

Given:
- P(Fraud) = 0.001
- P(Not Fraud) = 0.999
- P(Flagged | Fraud) = 0.95 (sensitivity)
- P(Not Flagged | Not Fraud) = 0.98 (specificity)
- P(Flagged | Not Fraud) = 0.02 (false positive rate)

Using Bayes' Theorem:
P(Fraud | Flagged) = P(Flagged | Fraud) × P(Fraud) / P(Flagged)

P(Flagged) = P(Flagged | Fraud) × P(Fraud) + P(Flagged | Not Fraud) × P(Not Fraud)
           = (0.95 × 0.001) + (0.02 × 0.999)
           = 0.00095 + 0.01998
           = 0.02093

P(Fraud | Flagged) = (0.95 × 0.001) / 0.02093 = 0.00095 / 0.02093 = 0.0454, or 4.54%

Insight: Even with a good model, only 4.54% of flagged transactions are actually fraudulent, due to the low base rate (0.1%).

Problem 3: Central Limit Theorem

Question: Average customer wait time is 5 minutes with standard deviation of 2 minutes. What's the probability that the average wait time for 100 customers exceeds 5.3 minutes?

Solution:

Given:
- μ = 5 minutes
- σ = 2 minutes
- n = 100 customers
- Sampling distribution of the mean: μ_x̄ = 5, σ_x̄ = σ/√n = 2/√100 = 0.2

Calculate the z-score:
z = (x̄ - μ_x̄) / σ_x̄ = (5.3 - 5) / 0.2 = 1.5

P(X̄ > 5.3) = P(Z > 1.5) = 1 - Φ(1.5) = 1 - 0.9332 = 0.0668, or 6.68%

Conclusion: There is about a 6.68% chance the average wait time exceeds 5.3 minutes.

Problem 4: Statistical Power

Question: You want to detect a 5% improvement in conversion rate (from 10% to 10.5%) with 80% power and 95% confidence. How many samples do you need per group?

import numpy as np
import scipy.stats as stats


def sample_size_proportions(p1, p2, alpha=0.05, power=0.80):
    """
    Calculate the required sample size per group for a two-proportion test.
    """
    # Z-scores
    z_alpha = stats.norm.ppf(1 - alpha / 2)  # Two-tailed
    z_beta = stats.norm.ppf(power)

    # Pooled proportion
    p_avg = (p1 + p2) / 2

    # Sample size formula
    n = (z_alpha * np.sqrt(2 * p_avg * (1 - p_avg))
         + z_beta * np.sqrt(p1 * (1 - p1) + p2 * (1 - p2)))**2 / (p1 - p2)**2

    return int(np.ceil(n))


p1 = 0.10    # Control conversion rate
p2 = 0.105   # Treatment conversion rate (5% relative improvement)

n = sample_size_proportions(p1, p2)
print(f"Required sample size per group: {n:,}")
# Output: ~57,760 per group -- a 0.5-point absolute lift needs very large samples

# For a 5 percentage point improvement (10% to 15%):
n_large_effect = sample_size_proportions(0.10, 0.15)
print(f"For 5 percentage point improvement: {n_large_effect:,}")
# Output: ~690 per group -- larger effects require far fewer samples

Problem 5: Confidence Intervals

Question: Sample of 500 customers shows average transaction value of $75 with standard deviation of $20. Calculate 95% confidence interval.

Given:
- n = 500
- x̄ = $75
- s = $20
- Confidence level = 95%

Standard error: SE = s/√n = 20/√500 = 0.894

For n > 30, use the z-distribution: z = 1.96 for 95% confidence

Margin of error: ME = z × SE = 1.96 × 0.894 = 1.75

95% CI = x̄ ± ME = $75 ± $1.75 = ($73.25, $76.75)

Interpretation: We're 95% confident the true average transaction value for all customers is between $73.25 and $76.75.

16. Machine Learning Interview Questions

Q1: Explain bias-variance tradeoff

Answer: The bias-variance tradeoff is the balance between two sources of model error:

  • Bias: Error from overly simplistic assumptions. High bias → underfitting (model too simple, misses patterns). Example: linear regression for non-linear data.
  • Variance: Error from sensitivity to training data fluctuations. High variance → overfitting (model too complex, captures noise). Example: deep decision tree.
  • Tradeoff: Total Error = Bias² + Variance + Irreducible Error
    • Simple models: High bias, low variance
    • Complex models: Low bias, high variance
    • Goal: Find sweet spot that minimizes total error
  • Mitigation: Cross-validation, regularization (L1/L2), ensemble methods, feature engineering
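
A quick way to make the tradeoff concrete in an interview is to show how cross-validated error changes with model complexity. A minimal sketch (assuming scikit-learn; the noisy sine data and degree choices are illustrative): degree 1 underfits (high bias), a very high degree overfits (high variance), and an intermediate degree minimizes CV error.

import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

# Synthetic noisy non-linear data
rng = np.random.default_rng(42)
X = np.sort(rng.uniform(0, 1, 80)).reshape(-1, 1)
y = np.sin(2 * np.pi * X).ravel() + rng.normal(0, 0.2, 80)

for degree in (1, 4, 15):  # high bias, balanced, high variance
    model = make_pipeline(PolynomialFeatures(degree), LinearRegression())
    cv_mse = -cross_val_score(model, X, y, cv=5,
                              scoring='neg_mean_squared_error').mean()
    print(f"degree={degree:2d}  cross-validated MSE={cv_mse:.3f}")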

Q2: When would you use Random Forest vs Gradient Boosting?

Answer:

  • Random Forest (Bagging):
    • Trains trees in parallel, each on random sample of data/features
    • Reduces variance through averaging
    • More robust to outliers and noise
    • Faster to train (parallelizable)
    • Less prone to overfitting
    • Use when: Need robustness, have noisy data, want faster training, need feature importance
  • Gradient Boosting (Boosting):
    • Trains trees sequentially, each correcting previous errors
    • Reduces bias through iterative improvement
    • Often higher accuracy
    • Slower to train (sequential)
    • More prone to overfitting, needs careful tuning
    • Use when: Need maximum accuracy, have clean data, willing to tune hyperparameters
  • Banking Example:
    • Fraud detection → Random Forest (need robustness, speed for real-time)
    • Credit scoring → Gradient Boosting (accuracy critical, offline training acceptable)

Q3: How do you handle imbalanced classes?

Answer: Imbalanced classes (e.g., 99% non-fraud, 1% fraud) require special handling:

  • 1. Resampling:
    • Oversampling minority class (SMOTE creates synthetic samples)
    • Undersampling majority class (risks losing information)
    • Hybrid approaches
  • 2. Algorithm-level:
    • Class weights (penalize errors on minority class more)
    • Threshold adjustment (lower threshold for minority class)
    • Anomaly detection algorithms (Isolation Forest, One-Class SVM)
  • 3. Evaluation metrics:
    • Don't use accuracy (misleading with imbalance)
    • Use: Precision, Recall, F1-score, ROC-AUC, PR-AUC
    • Focus on minority class performance
  • 4. Ensemble methods:
    • EasyEnsemble, BalancedRandomForest
    • Train multiple models on different balanced subsets
  • Banking Example: Fraud detection with 0.1% fraud rate
    • Use class_weight='balanced' in sklearn
    • Optimize for high recall (catch frauds) with acceptable precision
    • Use PR-AUC as primary metric

Q4: Feature selection techniques

Answer:

  • Filter Methods (Statistical tests, before modeling):
    • Correlation coefficient (remove highly correlated features)
    • Chi-square test (categorical features)
    • Mutual information (captures non-linear relationships)
    • Variance threshold (remove low-variance features)
  • Wrapper Methods (Use model performance):
    • Recursive Feature Elimination (RFE): Iteratively remove least important features
    • Forward selection: Start empty, add best features one by one
    • Backward elimination: Start with all, remove worst one by one
  • Embedded Methods (Built into algorithm):
    • L1 regularization (Lasso): Shrinks coefficients to zero
    • Tree-based importance: Random Forest, XGBoost feature importance
    • ElasticNet: Combines L1 and L2
  • When to use:
    • High dimensionality (curse of dimensionality)
    • Reduce overfitting
    • Improve interpretability
    • Faster training and inference

Q5: Explain regularization

Answer: Regularization adds a penalty term to the loss function to prevent overfitting:

Loss = Data Loss + λ × Regularization Term

L1 (Lasso): Loss = MSE + λ Σ|w_i|
- Penalty on the absolute values of the weights
- Produces sparse models (some weights become exactly 0)
- Good for feature selection
- Less stable with correlated features

L2 (Ridge): Loss = MSE + λ Σ w_i²
- Penalty on the squared weights
- Weights approach 0 but never become exactly 0
- Better with correlated features
- Produces smaller, more evenly distributed weights

ElasticNet: combines L1 and L2
- Loss = MSE + λ₁ Σ|w_i| + λ₂ Σ w_i²
- Benefits of both approaches

λ (lambda): regularization strength
- λ = 0: no regularization (may overfit)
- λ → ∞: strong regularization (may underfit)
- Find the optimal λ via cross-validation

Practical example: Predicting customer churn with 50 features, many correlated (age/tenure, income/balance). Use ElasticNet to handle correlations while selecting most important features.
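One possible sketch of that example, assuming scikit-learn and a synthetic stand-in for the 50 churn features: an elastic-net logistic regression where l1_ratio mixes the L1 and L2 penalties and C (the inverse of λ) sets the overall strength.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Synthetic churn-like data: 50 features, only some informative
X, y = make_classification(n_samples=5000, n_features=50, n_informative=10,
                           random_state=42)

model = make_pipeline(
    StandardScaler(),
    LogisticRegression(penalty='elasticnet', solver='saga',
                       l1_ratio=0.5, C=0.1, max_iter=5000)
)
model.fit(X, y)

coefs = model.named_steps['logisticregression'].coef_.ravel()
print(f"Features kept (non-zero coefficients): {np.sum(coefs != 0)} of {len(coefs)}")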

17. Time Series Analysis

Core Concepts

  • Components:
    • Trend: Long-term increase or decrease
    • Seasonality: Regular patterns (daily, weekly, yearly)
    • Cyclical: Irregular fluctuations (business cycles)
    • Residual/Noise: Random variation
  • Stationarity: Statistical properties constant over time
    • Tests: Augmented Dickey-Fuller (ADF), KPSS
    • Make stationary: Differencing, detrending, log transform
  • Autocorrelation: Correlation with lagged values
    • ACF (Autocorrelation Function): Direct correlation
    • PACF (Partial ACF): Correlation after removing effects of intermediate lags
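
A small illustration of these checks, assuming statsmodels and a toy daily series (the 'calls' framing is illustrative): run the ADF test, difference if needed, then inspect ACF/PACF values to guide ARIMA order selection.

import numpy as np
import pandas as pd
from statsmodels.tsa.stattools import adfuller, acf, pacf

# Toy series: trend + weekly seasonality + noise
idx = pd.date_range('2024-01-01', periods=180, freq='D')
rng = np.random.default_rng(0)
series = pd.Series(100 + 0.3 * np.arange(180)
                   + 10 * np.sin(2 * np.pi * np.arange(180) / 7)
                   + rng.normal(0, 3, 180), index=idx)

adf_stat, p_value = adfuller(series)[:2]
print(f"ADF p-value: {p_value:.3f} -> "
      f"{'stationary' if p_value < 0.05 else 'difference the series'}")

diffed = series.diff().dropna()           # first difference removes the trend
print("ACF (lags 1-7): ", np.round(acf(diffed, nlags=7)[1:], 2))
print("PACF (lags 1-7):", np.round(pacf(diffed, nlags=7)[1:], 2))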

Time Series Models

  • ARIMA (AutoRegressive Integrated Moving Average):
    • AR(p): Regression on past p values
    • I(d): Differencing d times for stationarity
    • MA(q): Regression on past q forecast errors
    • Model selection: Use ACF/PACF plots, AIC/BIC criteria
  • SARIMA: ARIMA with seasonal components
  • Prophet (Facebook): Handles seasonality, holidays, trend changes
  • LSTM/GRU: Deep learning for complex patterns
  • Exponential Smoothing: Simple, interpretable forecasting

Banking Use Case: Call Volume Forecasting

import pandas as pd
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller
import matplotlib.pyplot as plt

# Load call volume data
df = pd.read_csv('call_volumes.csv', parse_dates=['date'])
df.set_index('date', inplace=True)

# Decompose
decomposition = seasonal_decompose(df['calls'], model='additive', period=7)

# Check stationarity
result = adfuller(df['calls'])
print(f'ADF Statistic: {result[0]}')
print(f'p-value: {result[1]}')

# If not stationary, difference
if result[1] > 0.05:
    df['calls_diff'] = df['calls'].diff()
    df = df.dropna()

# Fit SARIMA model
# (p, d, q) x (P, D, Q, s), with s=7 for weekly seasonality
model = SARIMAX(
    df['calls'],
    order=(1, 1, 1),              # (p, d, q)
    seasonal_order=(1, 1, 1, 7),  # (P, D, Q, s)
    enforce_stationarity=False,
    enforce_invertibility=False
)
results = model.fit()

# Forecast the next 14 days
forecast = results.get_forecast(steps=14)
forecast_df = forecast.conf_int()
forecast_df['forecast'] = forecast.predicted_mean

# Business application
peak_days = forecast_df[forecast_df['forecast'] > forecast_df['forecast'].quantile(0.75)]
print(f"High volume days: {peak_days.index.tolist()}")
print("Recommended staffing: Increase by 15% on these days")

Evaluation Metrics

  • MAE (Mean Absolute Error): Average absolute difference, same units as data
  • RMSE (Root Mean Squared Error): Penalizes large errors more
  • MAPE (Mean Absolute Percentage Error): Scale-independent, interpretable as %
  • SMAPE (Symmetric MAPE): Better for data with zeros
  • AIC/BIC: Model selection (lower is better)
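
For reference, the sketch below computes these accuracy metrics directly with NumPy on made-up actual/forecast values; in practice you would compare candidate models on a held-out window.

import numpy as np

actual = np.array([120, 135, 128, 150, 142, 160, 155], dtype=float)
forecast = np.array([118, 140, 130, 145, 150, 158, 150], dtype=float)

mae = np.mean(np.abs(actual - forecast))
rmse = np.sqrt(np.mean((actual - forecast) ** 2))
mape = np.mean(np.abs((actual - forecast) / actual)) * 100
smape = np.mean(2 * np.abs(forecast - actual)
                / (np.abs(actual) + np.abs(forecast))) * 100

print(f"MAE: {mae:.1f}  RMSE: {rmse:.1f}  MAPE: {mape:.1f}%  SMAPE: {smape:.1f}%")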

18. Experimentation & Causal Inference

A/B Testing Best Practices

  • Design Phase:
    • Define hypothesis (null and alternative)
    • Choose primary metric (guardrail metrics too)
    • Calculate required sample size (power analysis)
    • Determine test duration (avoid novelty effect)
    • Randomization strategy (user-level vs session-level)
  • Running Phase:
    • Monitor for sample ratio mismatch (SRM)
    • Check for data quality issues early
    • Don't peek! (or use sequential testing methods)
    • Watch for external factors (holidays, outages)
  • Analysis Phase:
    • Check assumptions (normality, equal variance)
    • Statistical significance AND practical significance
    • Segmentation analysis (did all segments benefit?)
    • Long-term impact estimation
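
One concrete guardrail from the running phase is the sample ratio mismatch check. The sketch below (SciPy assumed; the counts are made up) compares observed group sizes against the planned 50/50 split with a chi-square goodness-of-fit test.

from scipy import stats

control_n, treatment_n = 50800, 49200         # illustrative observed counts
total = control_n + treatment_n
expected = [total / 2, total / 2]             # planned 50/50 allocation

chi2, p_value = stats.chisquare([control_n, treatment_n], f_exp=expected)
if p_value < 0.001:                           # strict threshold; SRM is a red flag
    print(f"Possible SRM (p={p_value:.2e}): investigate before trusting results")
else:
    print(f"No SRM detected (p={p_value:.4f})")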

Common Pitfalls

  • Peeking: Looking at results before test completes inflates Type I error
    • Solution: Use sequential testing (e.g., always valid inference)
  • Multiple comparisons: Testing many metrics increases false positives
    • Solution: Bonferroni correction, FDR control, pre-specify primary metric
  • Network effects: Treatment spills over to control group
    • Example: Referral program affects control users
    • Solution: Cluster randomization, ego-network analysis
  • Novelty effect: Users try new feature just because it's new
    • Solution: Run test longer (2-4 weeks minimum)
  • Survivorship bias: Only analyzing users who completed journey
    • Solution: Intent-to-treat analysis

Advanced Techniques

  • CUPED (Controlled-Experiment Using Pre-Experiment Data):
    • Reduces variance using pre-experiment covariates
    • Can reduce required sample size by 30-50%
    • Adjusted metric = Y - θ(X_pre - E[X_pre])
  • Multi-Armed Bandits:
    • Adaptive allocation: shift traffic to better variants
    • Reduce opportunity cost of experimentation
    • Trade-off: Exploration vs Exploitation
  • Difference-in-Differences:
    • Quasi-experimental design when randomization impossible
    • Compare changes over time between treatment and control
    • Assumes parallel trends
  • Synthetic Control:
    • Create synthetic control group from weighted combination
    • Useful for single-unit treatments (e.g., policy change in one region)
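
To make the CUPED formula above concrete, here is a hedged sketch on simulated data: θ is estimated as the covariance of the in-experiment metric with its pre-experiment counterpart divided by the pre-experiment variance, and the adjusted metric shows noticeably lower variance.

import numpy as np

rng = np.random.default_rng(7)
x_pre = rng.normal(100, 20, 10000)                  # pre-experiment metric (simulated)
y = 0.8 * x_pre + rng.normal(0, 10, 10000) + 2.0    # in-experiment metric (simulated)

theta = np.cov(y, x_pre)[0, 1] / np.var(x_pre, ddof=1)
y_cuped = y - theta * (x_pre - x_pre.mean())        # adjusted metric

print(f"variance before: {y.var():.1f}, after CUPED: {y_cuped.var():.1f}")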

Case Study: Testing New Chatbot for Customer Service

Hypothesis: AI chatbot will reduce call center volume by 10%

Design:
- Randomization: 50/50 split at customer level
- Primary metric: Call volume per customer (30 days)
- Guardrail metrics: CSAT, issue resolution time, escalation rate
- Sample size: 10,000 per group (80% power, 5% significance)
- Duration: 4 weeks (account for weekly seasonality + novelty)

Potential issues:
1. Learning effect: Chatbot improves over time
   → Solution: Pre-train with historical data
2. Selection bias: Tech-savvy users prefer the chatbot
   → Solution: Random assignment before showing the chatbot option
3. Novelty effect: Users try the chatbot out of curiosity
   → Solution: 4-week duration, analyze week-over-week trends
4. Metric dilution: Some users in treatment never see the chatbot
   → Solution: Analyze both ITT (intent-to-treat) and TOT (treatment-on-treated)

Analysis:
- Primary: Two-sample t-test on call volume
- Segmentation: Analyze by issue type, customer tenure, previous contact frequency
- Long-term: Estimate sustained impact after novelty wears off

Decision framework:
- Ship if: >8% reduction, no guardrail degradation, positive across key segments
- Iterate if: 5-8% reduction or negative sentiment
- Kill if: <5% reduction or significant CSAT decline

20. Algorithms & Data Structures Basics

Essential Data Structures

  • Arrays/Lists: O(1) access, O(n) search, good for indexed access
  • Hash Tables/Dictionaries: O(1) average insert/lookup, perfect for counting, grouping
  • Sets: O(1) membership testing, removing duplicates
  • Queues: FIFO, BFS, task scheduling
  • Stacks: LIFO, DFS, undo operations, expression evaluation
  • Heaps: O(log n) insert/delete min/max, priority queues, top-K problems

Common Algorithm Patterns

# 1. Two Pointers
def remove_duplicates_sorted_array(arr):
    """Remove duplicates from a sorted array in place."""
    if not arr:
        return 0
    write_idx = 1
    for read_idx in range(1, len(arr)):
        if arr[read_idx] != arr[read_idx - 1]:
            arr[write_idx] = arr[read_idx]
            write_idx += 1
    return write_idx  # New length


# 2. Sliding Window
def max_sum_subarray(arr, k):
    """Maximum sum of a subarray of size k."""
    if len(arr) < k:
        return None
    window_sum = sum(arr[:k])
    max_sum = window_sum
    for i in range(k, len(arr)):
        window_sum = window_sum - arr[i - k] + arr[i]
        max_sum = max(max_sum, window_sum)
    return max_sum


# 3. HashMap for Counting
def top_k_frequent(nums, k):
    """Find the k most frequent elements."""
    from collections import Counter
    import heapq
    counts = Counter(nums)
    return heapq.nlargest(k, counts.keys(), key=counts.get)


# 4. Binary Search
def binary_search(arr, target):
    """Find target in a sorted array."""
    left, right = 0, len(arr) - 1
    while left <= right:
        mid = (left + right) // 2
        if arr[mid] == target:
            return mid
        elif arr[mid] < target:
            left = mid + 1
        else:
            right = mid - 1
    return -1  # Not found

Analytics-Specific Problems

# Problem: Find the median of streaming data
import heapq


class MedianFinder:
    """
    Efficient median calculation for streaming data.
    Uses two heaps: a max heap for the smaller half, a min heap for the larger half.
    """
    def __init__(self):
        self.small = []  # Max heap (store negated values)
        self.large = []  # Min heap

    def add_num(self, num):
        # Add to the max heap (small)
        heapq.heappush(self.small, -num)

        # Balance: ensure max of small <= min of large
        if self.small and self.large and (-self.small[0] > self.large[0]):
            val = -heapq.heappop(self.small)
            heapq.heappush(self.large, val)

        # Balance sizes
        if len(self.small) > len(self.large) + 1:
            val = -heapq.heappop(self.small)
            heapq.heappush(self.large, val)
        if len(self.large) > len(self.small):
            val = heapq.heappop(self.large)
            heapq.heappush(self.small, -val)

    def find_median(self):
        if len(self.small) > len(self.large):
            return -self.small[0]
        return (-self.small[0] + self.large[0]) / 2


# Usage: calculate a rolling median for transaction amounts
mf = MedianFinder()
for amount in transaction_stream:
    mf.add_num(amount)
    current_median = mf.find_median()

# Time complexity: O(log n) per insertion, O(1) for the median
# Space complexity: O(n)

Time Complexity Cheat Sheet

  • O(1): Hash table lookup, array access by index
  • O(log n): Binary search, balanced tree operations
  • O(n): Linear scan, single loop
  • O(n log n): Efficient sorting (merge sort, heap sort)
  • O(n²): Nested loops (avoid when possible!)
  • O(2ⁿ): Recursive algorithms without memoization (avoid!)

SQL Query Optimization Mindset

  • Indexes: Like hash tables for databases - O(log n) vs O(n) scan
  • WHERE before JOIN: Filter early to reduce join size
  • Avoid SELECT *: Only fetch needed columns
  • Use EXISTS vs IN: EXISTS stops at first match
  • Partition pruning: WHERE on partitioned column
  • Materialized views: Pre-compute expensive aggregations

11. Business Case Study Examples

Case Study 1: Reducing Call Center Costs

Scenario: The call center handles 1M calls/month with an average cost of $5 per call. Leadership wants to reduce costs by 20% while maintaining customer satisfaction.

Approach:

  • Analyze call drivers: Use text mining on call transcripts to identify top reasons for calls
  • Identify automation opportunities: Simple inquiries (balance checks, payment status) → chatbot/IVR
  • Calculate impact: 30% of calls are routine inquiries, automation saves $3.50 per call
  • Estimate savings: 300K calls × $3.50 = $1.05M/month (21% reduction)
  • Monitor metrics: CSAT, containment rate, escalation rate, agent handle time
  • Iterate: Continuous improvement based on feedback and failure analysis

Case Study 2: Improving First Call Resolution

Scenario: Current FCR is 70%. Industry benchmark is 80%. Each repeat call costs $5.

Analysis Framework:

  • Root cause analysis: Why do customers call back?
    • Agent knowledge gaps → training opportunities
    • System limitations → tool improvements
    • Process issues → workflow optimization
  • Segmentation: Which customer segments / issue types have lowest FCR?
  • Agent analysis: Top performers vs bottom performers - what's different?
  • Data-driven recommendations:
    • Implement knowledge management system
    • Create decision trees for common scenarios
    • Real-time agent assist using ML
  • Impact sizing: 10% FCR improvement = 100K fewer calls/month = $500K savings

Case Study 3: ML Model for Fraud Detection

Scenario: Build a case for implementing ML-based fraud detection.

Business Case Structure:

  • Current state:
    • Rule-based system flags 5% of transactions
    • Manual review costs $2 per transaction
    • False positive rate: 90% (high customer friction)
    • Fraud losses: $10M annually
  • Proposed solution: ML model using ensemble methods (Random Forest + XGBoost)
  • Expected improvements:
    • Reduce false positives by 40% → better customer experience
    • Increase fraud detection rate by 15% → $1.5M additional fraud caught
    • Reduce manual review volume by 35% → $1.4M cost savings
  • Implementation costs: $500K (data prep, model dev, infrastructure, testing)
  • ROI: ($1.5M + $1.4M - $500K) / $500K = 480% first year
  • Success metrics: Precision, Recall, F1, AUC-ROC, cost per fraud detected

12. Behavioral Interview Preparation

Common Behavioral Questions (STAR Method)

  • "Tell me about a time you had to influence stakeholders without direct authority."
    • Situation: Cross-functional project requiring buy-in from product, engineering, design
    • Task: Convince teams to prioritize data infrastructure improvements
    • Action: Built business case showing impact, created prototype, addressed concerns individually
    • Result: Secured resources, delivered 30% reduction in data processing time
  • "Describe a complex analytical problem you solved independently."
    • Situation: Unexplained drop in conversion rates
    • Task: Identify root cause with limited guidance
    • Action: Segmented users, analyzed funnels, discovered browser-specific bug
    • Result: Fixed issue, recovered $500K in monthly revenue
  • "Give an example of when you had to make a data-driven recommendation to senior leadership."
  • "Tell me about a time you failed and what you learned."
  • "Describe how you handled conflicting priorities in an Agile environment."
  • "Share an example of improving operational efficiency through analytics."

Questions to Ask Interviewers

  • "What are the biggest challenges the MLIO team is currently facing?"
  • "How does the team measure success for ML/AI implementations?"
  • "What does the typical analytics project lifecycle look like here?"
  • "How does the team balance exploratory analysis with productionizing solutions?"
  • "What tools and technologies is the team investing in for the future?"
  • "How does this role interact with product teams during roadmap planning?"
  • "What opportunities exist for professional development and learning?"
  • "Can you describe a recent success story from the team?"

Key Themes to Emphasize

  • Business Impact Focus: Always tie technical work to business outcomes (cost savings, customer experience, revenue)
  • Communication: Ability to translate technical insights for non-technical audiences
  • Collaboration: Working effectively with engineering, product, design, and business teams
  • Agile Mindset: Iterative approach, failing fast, continuous improvement
  • Independence: Self-starter who can navigate ambiguity and find solutions
  • Customer-Centric: Understanding how analytics improves customer and agent experience
  • Learning Agility: Quickly picking up new tools, domains, and methodologies

21. Quick Reference Cheat Sheet

SQL Window Functions Syntax

-- Basic window function template
SELECT
    column1,
    column2,
    WINDOW_FUNCTION() OVER (
        [PARTITION BY partition_column]
        [ORDER BY order_column]
        [ROWS/RANGE BETWEEN ... AND ...]
    ) AS result_column
FROM table;

-- Common window functions:
ROW_NUMBER()      -- Unique sequential number
RANK()            -- Rank with gaps for ties
DENSE_RANK()      -- Rank without gaps
LAG(col, n)       -- Value from the nth previous row
LEAD(col, n)      -- Value from the nth following row
FIRST_VALUE()     -- First value in the window
LAST_VALUE()      -- Last value in the window

-- Frame specifications:
ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW    -- Running total
ROWS BETWEEN 6 PRECEDING AND CURRENT ROW            -- 7-day window
ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING    -- Remaining rows

Python Pandas Quick Reference

# Data loading
df = pd.read_csv('file.csv')
df = pd.read_sql(query, connection)
df = pd.read_parquet('file.parquet')

# Exploration
df.head(), df.tail(), df.sample(5)
df.info(), df.describe()
df.shape, df.columns, df.dtypes
df.isnull().sum()
df.value_counts()

# Filtering
df[df['col'] > 10]
df[df['col'].isin([1, 2, 3])]
df.query('col1 > 10 and col2 == "A"')

# Aggregation
df.groupby('col').agg({'val': ['sum', 'mean', 'count']})
df.pivot_table(values='val', index='row', columns='col', aggfunc='mean')

# Merging
pd.merge(df1, df2, on='key', how='left')
pd.concat([df1, df2], axis=0)  # Stack vertically

# Window operations
df['running_total'] = df.groupby('group')['value'].cumsum()
df['pct_change'] = df['value'].pct_change()
df['rolling_mean'] = df['value'].rolling(window=7).mean()

# Apply functions
df['new_col'] = df['col'].apply(lambda x: x * 2)
df['category'] = df['value'].apply(lambda x: 'high' if x > 100 else 'low')

Statistics Formulas Quick Reference

# Confidence Interval (mean)
CI = x̄ ± (z * σ/√n), where z = 1.96 for 95% confidence

# Sample Size (two proportions)
n = (z_α + z_β)² * [p1(1-p1) + p2(1-p2)] / (p1 - p2)²

# Standard Error
SE = σ / √n

# Z-score
z = (x - μ) / σ

# T-statistic (two-sample)
t = (x̄1 - x̄2) / √(s1²/n1 + s2²/n2)

# Chi-square test statistic
χ² = Σ[(O - E)² / E]

# Correlation
r = Σ[(xi - x̄)(yi - ȳ)] / √[Σ(xi - x̄)² * Σ(yi - ȳ)²]

# R-squared
R² = 1 - (SS_res / SS_tot)

ML Model Selection Guide

Problem Type → Algorithm Choice:

CLASSIFICATION (Binary):
├── Linearly separable → Logistic Regression
├── Non-linear, need interpretability → Decision Tree
├── High accuracy needed → Random Forest, XGBoost
├── Large dataset, many features → Neural Network
└── Imbalanced classes → Random Forest with class weights, SMOTE

CLASSIFICATION (Multi-class):
├── Simple baseline → Multinomial Logistic Regression
├── Complex patterns → Random Forest, XGBoost
├── Text/images → Neural Networks (CNN, Transformers)
└── Many classes (100+) → Hierarchical models

REGRESSION:
├── Linear relationship → Linear Regression
├── Feature selection needed → Lasso (L1)
├── Multicollinearity → Ridge (L2)
├── Non-linear → Polynomial Regression, Decision Trees
├── High accuracy → Random Forest, XGBoost
└── Time series → ARIMA, Prophet, LSTM

CLUSTERING:
├── Known number of clusters → K-Means
├── Density-based → DBSCAN
├── Hierarchical structure → Hierarchical Clustering
└── Mixed data types → K-Prototypes

DIMENSIONALITY REDUCTION:
├── Linear → PCA
├── Non-linear → t-SNE, UMAP
└── Supervised → LDA

Interview Question Frameworks

SQL PROBLEM FRAMEWORK:
1. Clarify requirements (what's the desired output?)
2. Identify tables and relationships
3. Break the problem into steps (CTEs)
4. Write the query incrementally
5. Test with edge cases
6. Optimize if needed (indexes, window functions vs subqueries)

PYTHON PROBLEM FRAMEWORK:
1. Clarify inputs/outputs and constraints
2. Discuss the approach at a high level
3. Consider edge cases
4. Write code with clear variable names
5. Test with examples
6. Discuss time/space complexity

CASE STUDY FRAMEWORK:
1. Clarify the business context and goal
2. Define success metrics
3. Identify the data needed
4. Propose an analysis approach
5. Discuss potential findings and recommendations
6. Consider implementation challenges

BEHAVIORAL (STAR):
1. Situation (2 sentences max - set context)
2. Task (1 sentence - your specific responsibility)
3. Action (3-4 specific actions YOU took)
4. Result (quantified outcome + learning)

22. Essential Formulas & Concepts

Business Metrics Calculations

# Customer Acquisition Cost (CAC)
CAC = Total Marketing & Sales Spend / Number of New Customers

# Customer Lifetime Value (CLV)
CLV = (Average Purchase Value × Purchase Frequency × Customer Lifespan) - CAC

# Churn Rate
Monthly Churn = (Customers Lost in Month / Customers at Start of Month) × 100

# Net Revenue Retention (NRR)
NRR = (Starting MRR + Expansion - Contraction - Churn) / Starting MRR × 100

# Monthly Active Users (MAU)
MAU = Count of unique users with activity in the last 30 days

# DAU/MAU Ratio (Stickiness)
Stickiness = (DAU / MAU) × 100
Target: >20% is good, >50% is excellent

# Conversion Rate
Conversion Rate = (Conversions / Total Visitors) × 100

# Average Revenue Per User (ARPU)
ARPU = Total Revenue / Number of Users

# Payback Period
Payback Period = CAC / (ARPU × Gross Margin %)

Statistical Tests Decision Tree

Choosing the Right Test:

COMPARING MEANS:
├── Two groups, continuous outcome
│   ├── Normal distribution, equal variance → Independent t-test
│   ├── Non-normal OR unequal variance → Welch's t-test
│   └── Paired/matched data → Paired t-test
├── Three+ groups → ANOVA (if normal) or Kruskal-Wallis (if not)
└── Before/after single group → Paired t-test or Wilcoxon signed-rank

COMPARING PROPORTIONS:
├── Two proportions → Two-proportion z-test
├── More than two proportions → Chi-square test
└── Small sample (<30) → Fisher's exact test

RELATIONSHIPS:
├── Two continuous variables → Pearson correlation (linear) or Spearman (monotonic)
├── Categorical × Categorical → Chi-square test of independence
└── Predict continuous from predictors → Linear regression

ASSUMPTIONS TO CHECK:
├── Normality: Shapiro-Wilk test, Q-Q plot, histogram
├── Equal variance: Levene's test, F-test
├── Independence: Study design, no autocorrelation
└── Sample size: Rule of thumb n ≥ 30 for CLT
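
A short sketch of the two-group path above (SciPy assumed; the data are simulated): check normality with Shapiro-Wilk and equal variance with Levene's test, then choose between Student's and Welch's t-test accordingly.

import numpy as np
from scipy import stats

rng = np.random.default_rng(1)
group_a = rng.normal(75, 20, 250)     # e.g., transaction values, control
group_b = rng.normal(78, 25, 250)     # treatment

_, p_norm_a = stats.shapiro(group_a)
_, p_norm_b = stats.shapiro(group_b)
_, p_var = stats.levene(group_a, group_b)

equal_var = p_var > 0.05
t_stat, p_value = stats.ttest_ind(group_a, group_b, equal_var=equal_var)
print(f"normality p-values: {p_norm_a:.3f}, {p_norm_b:.3f}; Levene p: {p_var:.3f}")
print(f"{'Student' if equal_var else 'Welch'} t-test: t={t_stat:.2f}, p={p_value:.4f}")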

Time Complexity Examples

# O(1) - Constant
hash_map[key]          # Dictionary lookup
array[index]           # Array access

# O(log n) - Logarithmic
binary_search(sorted_array, target)
balanced_tree.search(value)

# O(n) - Linear
for item in list:
    process(item)
sum(array)
linear_search(array, target)

# O(n log n) - Linearithmic
sorted(array)                 # Timsort / mergesort
heapq.nlargest(k, array)      # when k ~= n

# O(n²) - Quadratic
for i in array:
    for j in array:
        compare(i, j)
bubble_sort(array)

# O(2^n) - Exponential (AVOID!)
def fibonacci_recursive(n):   # Without memoization
    if n <= 1:
        return n
    return fibonacci_recursive(n - 1) + fibonacci_recursive(n - 2)

# Optimization: use memoization or dynamic programming
from functools import lru_cache

@lru_cache(maxsize=None)
def fibonacci_memoized(n):    # Now O(n)
    if n <= 1:
        return n
    return fibonacci_memoized(n - 1) + fibonacci_memoized(n - 2)

23. Real JPMorgan Interview Questions

Technical Round Questions (Glassdoor/Blind Compilation)

  • SQL:
    • "Write a query to find the second highest salary in each department"
    • "Calculate running total of transactions by customer, ordered by date"
    • "Find customers who made purchases in all 12 months of the year"
    • "Identify gaps in sequential transaction IDs"
    • "Calculate month-over-month growth rate for each product"
  • Python/Pandas:
    • "Given a DataFrame of transactions, find customers with declining spend over last 3 months"
    • "Implement a function to detect outliers using IQR method"
    • "Merge two datasets on approximate string matching (fuzzy matching)"
    • "Calculate cohort retention rates from transaction data"
  • Statistics:
    • "How would you detect if a coin is biased?"
    • "Explain p-value to a non-technical stakeholder"
    • "You run an A/B test and get p=0.051. What do you do?"
    • "How do you determine sample size for an experiment?"
  • Machine Learning:
    • "How would you build a model to predict credit card fraud?"
    • "Explain the difference between precision and recall, and when each matters"
    • "How do you handle imbalanced datasets?"
    • "Walk me through your process for feature engineering"
    • "How do you prevent overfitting?"

Case Study Examples

  • "Credit card approval rates dropped 15% last month. Investigate."
    • Framework: Segment by applicant type, product, channel, geography, time
    • Check: Model changes, policy changes, data quality, external factors (economy, marketing)
    • Action: Root cause analysis, quantify impact, recommend fixes
  • "Design a dashboard for contact center operations leaders."
    • Stakeholders: Ops managers, team leads, executives
    • Metrics: Volume, AHT, FCR, CSAT, agent utilization, SLA
    • Features: Real-time view, trends, alerts, drill-downs by channel/reason/agent
  • "Should we invest in a chatbot for customer service?"
    • Define success: Cost per interaction, CSAT, containment rate
    • Estimate: Addressable call volume, cost savings, development cost
    • ROI calculation: Savings vs investment, payback period
    • Risks: CSAT decline, implementation challenges, adoption
  • "Analyze mobile app ratings decline from 4.5 to 4.0."
    • Data: App store reviews, version history, support tickets, usage analytics
    • Analysis: Text mining of reviews, correlation with app versions, platform differences
    • Findings: Recent update (v3.2) introduced bugs for iOS 14 users
    • Recommendation: Hotfix release, proactive communication, improved QA

Behavioral Questions (JPM-Specific)

  • "Tell me about a time you had to work with incomplete data"
  • "Describe a situation where your analysis changed a business decision"
  • "How do you prioritize when you have multiple urgent requests?"
  • "Tell me about a time you had to explain a technical concept to a non-technical audience"
  • "Describe a project where you had to collaborate with engineering and product teams"
  • "What do you know about JPMorgan Chase's consumer banking business?"
  • "Why do you want to work in the ML & Intelligent Operations team specifically?"
  • "How would you approach learning about a new business domain?"
  • "Tell me about a time you identified an opportunity that others missed"
  • "Describe your experience with Agile/Scrum methodology"