Advanced Problems
Focus: point-in-time correctness, intervals, many-to-many joins, advanced windows, and production-grade reasoning.
1. Build a point-in-time feature table for each prediction point with latest feature values.
WITH ranked_features AS (
SELECT
p.entity_id,
p.prediction_ts,
f.feature_name,
f.feature_value,
ROW_NUMBER() OVER (
PARTITION BY p.entity_id, p.prediction_ts, f.feature_name
ORDER BY f.snapshot_ts DESC
) AS rn
FROM prediction_points p
LEFT JOIN feature_snapshots f
ON f.entity_id = p.entity_id
AND f.snapshot_ts <= p.prediction_ts
)
SELECT entity_id, prediction_ts, feature_name, feature_value
FROM ranked_features
WHERE rn = 1;
The left join preserves prediction points. Some engines require careful handling because rows with no feature produce one NULL feature row.
2. Generate fraud labels: chargeback within 30 days after payment.
SELECT
p.payment_id,
p.order_id,
p.payment_ts AS prediction_ts,
CASE
WHEN EXISTS (
SELECT 1
FROM chargebacks c
WHERE c.payment_id = p.payment_id
AND c.chargeback_ts > p.payment_ts
AND c.chargeback_ts <= p.payment_ts + INTERVAL '30' DAY
)
THEN 1
ELSE 0
END AS fraud_label
FROM payments p
WHERE p.status = 'succeeded';
Make sure chargeback data would not be used as a feature before the label window.
3. Sessionize events with a 30-minute inactivity threshold.
WITH ordered AS (
SELECT
user_id,
event_ts,
event_id,
LAG(event_ts) OVER (
PARTITION BY user_id
ORDER BY event_ts, event_id
) AS prev_event_ts
FROM events
),
marked AS (
SELECT
*,
CASE
WHEN prev_event_ts IS NULL THEN 1
WHEN event_ts > prev_event_ts + INTERVAL '30' MINUTE THEN 1
ELSE 0
END AS new_session_flag
FROM ordered
),
numbered AS (
SELECT
*,
SUM(new_session_flag) OVER (
PARTITION BY user_id
ORDER BY event_ts, event_id
ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
) AS session_number
FROM marked
)
SELECT
user_id,
session_number,
MIN(event_ts) AS started_at,
MAX(event_ts) AS ended_at,
COUNT(*) AS events
FROM numbered
GROUP BY user_id, session_number;
4. Find the longest streak of paid-order days per customer.
WITH paid_days AS (
SELECT DISTINCT customer_id, order_day_number
FROM orders
WHERE status = 'paid'
),
numbered AS (
SELECT
customer_id,
order_day_number,
order_day_number - ROW_NUMBER() OVER (
PARTITION BY customer_id
ORDER BY order_day_number
) AS island_key
FROM paid_days
),
streaks AS (
SELECT
customer_id,
MIN(order_day_number) AS start_day,
MAX(order_day_number) AS end_day,
COUNT(*) AS streak_days
FROM numbered
GROUP BY customer_id, island_key
)
SELECT customer_id, MAX(streak_days) AS longest_streak_days
FROM streaks
GROUP BY customer_id;
order_day_number is a portable stand-in for date arithmetic.
5. Attribute each purchase to the most recent marketing touch before purchase.
WITH candidates AS (
SELECT
o.order_id,
o.customer_id,
o.order_ts,
t.touch_id,
t.channel,
ROW_NUMBER() OVER (
PARTITION BY o.order_id
ORDER BY t.touch_ts DESC, t.touch_id DESC
) AS rn
FROM orders o
JOIN marketing_touches t
ON t.customer_id = o.customer_id
AND t.touch_ts <= o.order_ts
WHERE o.status = 'paid'
)
SELECT order_id, customer_id, order_ts, touch_id, channel
FROM candidates
WHERE rn = 1;
Add an attribution lookback window if the business requires one.
6. Compute retained users by cohort month and age month.
WITH first_seen AS (
SELECT user_id, MIN(event_month) AS cohort_month
FROM events
GROUP BY user_id
),
monthly_activity AS (
SELECT DISTINCT user_id, event_month
FROM events
),
cohort_activity AS (
SELECT
f.cohort_month,
a.event_month,
a.user_id
FROM first_seen f
JOIN monthly_activity a
ON a.user_id = f.user_id
AND a.event_month >= f.cohort_month
)
SELECT
cohort_month,
event_month,
COUNT(*) AS retained_users
FROM cohort_activity
GROUP BY cohort_month, event_month;
Use engine-specific date diff to derive age_month.
7. Find overlapping subscriptions for the same customer.
SELECT
a.customer_id,
a.subscription_id AS subscription_a,
b.subscription_id AS subscription_b
FROM subscriptions a
JOIN subscriptions b
ON b.customer_id = a.customer_id
AND b.subscription_id > a.subscription_id
AND a.started_at < b.ended_at
AND b.started_at < a.ended_at;
The overlap rule for half-open intervals is a.start < b.end AND b.start < a.end.
8. Detect many-to-many join amplification between orders and promotions.
WITH joined AS (
SELECT
o.order_id,
COUNT(*) AS joined_rows
FROM orders o
JOIN promotions p
ON p.customer_id = o.customer_id
AND o.order_ts >= p.valid_from
AND o.order_ts < p.valid_to
GROUP BY o.order_id
)
SELECT *
FROM joined
WHERE joined_rows > 1
ORDER BY joined_rows DESC;
Then define tie-breaking or attribution rules before summing revenue.
9. Compute revenue percentile buckets per customer.
Portable exact percentile functions vary. Use ranking buckets as a generic approach.
WITH revenue AS (
SELECT customer_id, SUM(total_amount) AS paid_revenue
FROM orders
WHERE status = 'paid'
GROUP BY customer_id
),
bucketed AS (
SELECT
customer_id,
paid_revenue,
NTILE(10) OVER (ORDER BY paid_revenue) AS decile
FROM revenue
)
SELECT decile, COUNT(*) AS customers, MIN(paid_revenue), MAX(paid_revenue)
FROM bucketed
GROUP BY decile;
NTILE creates roughly equal row-count buckets, not equal revenue ranges.
10. Build a confusion matrix by model version.
SELECT
p.model_version,
SUM(CASE WHEN p.score >= 0.5 AND l.label_value = 1 THEN 1 ELSE 0 END) AS tp,
SUM(CASE WHEN p.score >= 0.5 AND l.label_value = 0 THEN 1 ELSE 0 END) AS fp,
SUM(CASE WHEN p.score < 0.5 AND l.label_value = 1 THEN 1 ELSE 0 END) AS fn,
SUM(CASE WHEN p.score < 0.5 AND l.label_value = 0 THEN 1 ELSE 0 END) AS tn
FROM predictions p
JOIN labels l
ON l.entity_id = p.entity_id
AND l.label_ts = p.prediction_ts
GROUP BY p.model_version;
Make the threshold a parameter in real evaluation.
11. Find model score drift by comparing current week to previous week buckets.
WITH bucketed AS (
SELECT
model_version,
score_week,
CASE
WHEN score < 0.1 THEN '00-10'
WHEN score < 0.2 THEN '10-20'
WHEN score < 0.3 THEN '20-30'
WHEN score < 0.4 THEN '30-40'
WHEN score < 0.5 THEN '40-50'
WHEN score < 0.6 THEN '50-60'
WHEN score < 0.7 THEN '60-70'
WHEN score < 0.8 THEN '70-80'
WHEN score < 0.9 THEN '80-90'
ELSE '90-100'
END AS score_bucket,
COUNT(*) AS predictions
FROM predictions
GROUP BY model_version, score_week, score_bucket
)
SELECT
model_version,
score_week,
score_bucket,
predictions,
LAG(predictions) OVER (
PARTITION BY model_version, score_bucket
ORDER BY score_week
) AS previous_predictions
FROM bucketed;
Normalize to proportions before alerting, because total traffic can change.
12. Compute exact median order value per country without a percentile function.
This uses row numbers and counts. It averages the two middle rows for even counts.
WITH ordered AS (
SELECT
c.country,
o.total_amount,
ROW_NUMBER() OVER (
PARTITION BY c.country
ORDER BY o.total_amount
) AS rn,
COUNT(*) OVER (
PARTITION BY c.country
) AS cnt
FROM orders o
JOIN customers c
ON c.customer_id = o.customer_id
WHERE o.status = 'paid'
)
SELECT
country,
AVG(total_amount) AS median_order_value
FROM ordered
WHERE rn IN ((cnt + 1) / 2, (cnt + 2) / 2)
GROUP BY country;
Integer division behavior varies by engine. In interviews, explain the intent if syntax needs adjustment.
13. For each entity, join a label to the nearest prediction before label time.
WITH candidates AS (
SELECT
l.entity_id,
l.label_ts,
l.label_value,
p.prediction_ts,
p.score,
ROW_NUMBER() OVER (
PARTITION BY l.entity_id, l.label_ts
ORDER BY p.prediction_ts DESC
) AS rn
FROM labels l
JOIN predictions p
ON p.entity_id = l.entity_id
AND p.prediction_ts <= l.label_ts
)
SELECT entity_id, label_ts, label_value, prediction_ts, score
FROM candidates
WHERE rn = 1;
This is an as-of join in the reverse direction.
14. Identify users whose purchase event appears before assignment to an experiment.
SELECT
x.user_id,
x.experiment_name,
x.assigned_ts,
MIN(e.event_ts) AS first_purchase_ts
FROM experiments x
JOIN events e
ON e.user_id = x.user_id
AND e.event_name = 'purchase'
GROUP BY x.user_id, x.experiment_name, x.assigned_ts
HAVING MIN(e.event_ts) < x.assigned_ts;
This can reveal experiment contamination or reused users from previous tests.
15. Create an order lifecycle table from event logs.
SELECT
order_id,
MIN(CASE WHEN event_name = 'created' THEN event_ts END) AS created_ts,
MIN(CASE WHEN event_name = 'paid' THEN event_ts END) AS paid_ts,
MIN(CASE WHEN event_name = 'shipped' THEN event_ts END) AS shipped_ts,
MIN(CASE WHEN event_name = 'delivered' THEN event_ts END) AS delivered_ts,
MIN(CASE WHEN event_name = 'cancelled' THEN event_ts END) AS cancelled_ts
FROM order_events
GROUP BY order_id;
Then add validation: paid after created, shipped after paid, delivered after shipped, cancelled conflicts with delivered unless business rules allow it.