how to use first_value with reset

WITH t AS
(
    SELECT
        *
    FROM
        (VALUES ('2021-10-09 23:58:34.000', '2021-10-10 00:00:00.000', 'task_completed', '00001'),
                ('2021-10-10 00:00:01.000', '2021-10-10 00:00:05.000', 'task_completed', '00001'),
                ('2021-10-10 00:00:06.000', '2021-10-10 00:00:25.000',  'task_completed', '00001'),
                ('2021-10-10 00:00:26.000', '2021-10-10 00:00:45.000', 'task_not_completed', '00001'),
                ('2021-10-10 00:00:46.000', '2021-10-10 00:01:00.000', 'task_not_completed', '00001'),
                ('2021-10-10 00:01:01.000', '2021-10-10 00:01:10.000', 'task_completed', '00001'),
                ('2021-10-10 00:01:11.000', '2021-10-10 00:01:15.000', 'task_completed', '00001')) AS t(start_time, end_time, task_state, person_id)
    ORDER BY 
        1
)
SELECT * 
FROM t

​x
 
WITH t AS(    SELECT        *    FROM        (VALUES ('2021-10-09 23:58:34.000', '2021-10-10 00:00:00.000', 'task_completed', '00001'),                ('2021-10-10 00:00:01.000', '2021-10-10 00:00:05.000', 'task_completed', '00001'),                ('2021-10-10 00:00:06.000', '2021-10-10 00:00:25.000',  'task_completed', '00001'),                ('2021-10-10 00:00:26.000', '2021-10-10 00:00:45.000', 'task_not_completed', '00001'),                ('2021-10-10 00:00:46.000', '2021-10-10 00:01:00.000', 'task_not_completed', '00001'),                ('2021-10-10 00:01:01.000', '2021-10-10 00:01:10.000', 'task_completed', '00001'),                ('2021-10-10 00:01:11.000', '2021-10-10 00:01:15.000', 'task_completed', '00001')) AS t(start_time, end_time, task_state, person_id)    ORDER BY         1)SELECT * FROM t​

I am trying to obtain an aggregated result that appears like this:

start_time	end_time	task_state
‘2021-10-09 23:58:34.000	2021-10-10 00:00:25.000′	task_completed
‘2021-10-10 00:00:26.000	2021-10-10 00:01:00.000′	task_not_completed
‘2021-10-10 00:01:01.000	2021-10-10 00:01:15.000′	task_completed

I have tried using the first_value function but it just seems to give the value ‘2021-10-09 23:58:34.000’ for all the task_state.

Not sure what I am doing wrong. Here is what I tried:

SELECT
    start_time,  
    FIRST_VALUE(start_time) OVER (ORDER BY start_time) AS end_time, 
    MIN(task_state) OVER (ORDER BY start_time) AS state
FROM
    t

 
SELECT    start_time,      FIRST_VALUE(start_time) OVER (ORDER BY start_time) AS end_time,     MIN(task_state) OVER (ORDER BY start_time) AS stateFROM    t​

looking for help for this

Answer

its a gaps & island problem:

SELECT person_id, task_state, MIN(start_time) start_time, MAX(end_time) end_time  
from (
   SELECT * 
       , ROW_NUMBER() OVER (PARTITION BY t.person_id ORDER BY start_time)
       - ROW_NUMBER() OVER (PARTITION BY t.person_id, t.task_state ORDER BY start_time) AS taskgroups
   FROM t
) tt GROUP BY taskgroups, person_id, task_state

 
SELECT person_id, task_state, MIN(start_time) start_time, MAX(end_time) end_time  from (   SELECT *        , ROW_NUMBER() OVER (PARTITION BY t.person_id ORDER BY start_time)       - ROW_NUMBER() OVER (PARTITION BY t.person_id, t.task_state ORDER BY start_time) AS taskgroups   FROM t) tt GROUP BY taskgroups, person_id, task_state​

db<>fiddle here

Advertisement

Answer