Skip to content
Advertisement

AWS Athena: Unsupported correlated subquery type but it works on Mysql and other flavors

For each order in orders table, I’m trying to return the latest order update value based on time_sent, grouped into a single row.

CREATE TABLE order_updates (time_sent INTEGER, order_id INTEGER, a VARCHAR(2), b VARCHAR(2), c VARCHAR(2), d VARCHAR(2));

INSERT INTO order_updates (time_sent, order_id, a, b, c, d) VALUES
('1', '1', 'a1', null, null, null),
('2', '1', null, 'b1', null, null),
('3', '1', null, null, 'c0', null),
('4', '1', 'a3', null, 'c3', null),
('5', '1', 'a0', null, null, null),
('6', '2', 'a3', 'b0', null, 'd2'),
('7', '2', null, 'b3', null, null);

CREATE TABLE orders (order_id INTEGER, name VARCHAR(32));
insert into orders (order_id, name) values (1, "joe"), (2, "bill");

select *, (select order_updates.a from order_updates where order_updates.order_id = orders.order_id and order_updates.a is not null order by order_updates.time_sent desc limit 1) a,
(select order_updates.b from order_updates where order_updates.order_id = orders.order_id and order_updates.b is not null order by order_updates.time_sent desc limit 1) b,
(select order_updates.c from order_updates where order_updates.order_id = orders.order_id and order_updates.c is not null order by order_updates.time_sent desc limit 1) c,
(select order_updates.d from order_updates where order_updates.order_id = orders.order_id and order_updates.d is not null order by order_updates.time_sent desc limit 1) d
FROM orders;

The above select query works fine in MySql but not in AWS Athena.

Working MySQL example below:

https://dbfiddle.uk/?rdbms=mysql_8.0&fiddle=66f17cf9f3c8f19f758c7cb1381d8207

Is there a way to accomplish the above using joins?

Advertisement

Answer

It’s possible to achieve this with Window Functions – as available in Athena/Presto 0.172, here an example for getting column a and b, extensible for c and d:

with order_updates_a as
(
    select order_id,a
    ,ROW_NUMBER()OVER(partition by order_id order by time_sent desc) nr
    from order_updates
    where a <> '' and a is not null
)
,order_updates_b as
(
    select order_id,b
    ,ROW_NUMBER()OVER(partition by order_id order by time_sent desc) nr
    from order_updates
    where b <> '' and b is not null
)
,order_updates_c as
(
    select order_id,c
    ,ROW_NUMBER()OVER(partition by order_id order by time_sent desc) nr
    from order_updates
    where c <> '' and c is not null
)
,order_updates_d as
(
    select order_id,d
    ,ROW_NUMBER()OVER(partition by order_id order by time_sent desc) nr
    from order_updates
    where d <> '' and d is not null
)
select o.*
,a.a
,b.b
,c.c
,d.d
from order o
left join order_updates_a a
    on o.order_id = a.order_id and a.nr = 1
left join order_updates_b b
    on o.order_id = b.order_id and b.nr = 1
left join order_updates_c c
    on o.order_id = c.order_id and c.nr = 1
left join order_updates_d d
    on o.order_id = d.order_id and d.nr = 1
User contributions licensed under: CC BY-SA
4 People found this is helpful
Advertisement