Scenario: user has a subscription where the size of the product can be updated. This results in the following data set
-- Dummy data set: one row per subscription document. Each `data` value is a
-- JSON string holding a current size, an optional sizeHistory array of size
-- changes, and a creation timestamp.
-- NOTE: the JSON literals are single-quoted; wrapping them in double quotes
-- (as originally posted) breaks the literal at the first embedded `"`.
with subscriptions as (
  select '{"currentSize":"2", "sizeHistory":[{"from":"1", "to":"2", "timestamp":{"_seconds":1588543200}}], "createdAt": {"_seconds":1587543200}}' as data,
         'docWithSingleHistory' as document_name union all
  select '{"currentSize":"3", "sizeHistory":[{"from":"1", "to":"2", "timestamp":{"_seconds":1588543200}}, {"from":"2", "to":"3", "timestamp":{"_seconds":1589543200}}], "createdAt": {"_seconds":1587543200}}' as data,
         'docWithMoreHistory' as document_name union all
  select '{"currentSize":"3", "createdAt": {"_seconds":1587543200}}' as data,
         'docWithoutHistory' as document_name
)
select document_name, data from subscriptions
The subscription has a current size and an optional size history, which contains the updates and when they have been made.
My goals are to:
- get the initial size of each subscription. If there is no history, the initial size is simply the current size (as of the createdAt timestamp). If there is history, it is the “from” value of the sizeHistory array member with the lowest timestamp
- get the average value of the time it takes to go from one size to the other.
I got stuck on the first goal already without even considering the subscription without history case. Here’s what I have so far
-- Attempted query for goal 1 (known to be incomplete -- see discussion below):
-- the subquery computes ONE global minimum timestamp across all documents, so
-- the WHERE filter keeps only the single subscription matching that global
-- minimum instead of each document's own earliest history entry.
-- The JSON literals are single-quoted so the embedded double quotes parse.
with subscriptions as (
  select '{"currentSize":"2", "sizeHistory":[{"from":"1", "to":"2", "timestamp":{"_seconds":1588543201}}], "createdAt": {"_seconds":1587543200}}' as data,
         'docWithSingleHistory' as document_name union all
  select '{"currentSize":"3", "sizeHistory":[{"from":"1", "to":"2", "timestamp":{"_seconds":1588543202}}, {"from":"2", "to":"3", "timestamp":{"_seconds":1589543200}}], "createdAt": {"_seconds":1587543200}}' as data,
         'docWithMoreHistory' as document_name union all
  select '{"currentSize":"3", "createdAt": {"_seconds":1587543200}}' as data,
         'docWithoutHistory' as document_name
)
select
  document_name,
  JSON_EXTRACT_SCALAR(sizeHistory, "$.from") as start_size,
  JSON_EXTRACT_SCALAR(sizeHistory, "$.timestamp['_seconds']") as timestamp,
from (
  -- Global (not per-document) minimum -- this is the flaw being asked about.
  select MIN(JSON_EXTRACT(sizeHistoryDoc, "$.timestamp['_seconds']")) as minStartDate
  from subscriptions, UNNEST(JSON_EXTRACT_ARRAY(data, "$.sizeHistory")) as sizeHistoryDoc
), subscriptions, UNNEST(JSON_EXTRACT_ARRAY(data, "$.sizeHistory")) as sizeHistory
where JSON_EXTRACT_SCALAR(sizeHistory, "$.timestamp['_seconds']") = minStartDate
The main reasoning was: for each subscription, get the corresponding sizeHistory element with the minimum timestamp. The problem is that the where condition is applied against the entire dataset, so I get just one subscription out (the one with the overall minimum timestamp).
Here’s an example of my ideal results structure (not based on the dummy data provided above):
| ------------------------------------ |
| start size | number of subscriptions |
| -------------------------------------|
| 1 | 2 |
| 2 | 10 |
| -------------------------------------|
| -----------------------------------------------------------|
| change | number of subscriptions | AVG days to change |
| -----------------------------------------------------------|
| 1 to 2 | 5 | 30 |
| 2 to 3 | 2 | 20 |
| -----------------------------------------------------------|
I’m stuck on this for some days already, so any solution/help/hint would be really useful.
Advertisement
Answer
Please see my query below. I use several CTEs to break down the logic into manageable chunks; change the select statement at the end of the query to see what each is doing.
-- Full solution. Each CTE is one logical step; swap the final SELECT between
-- results1 and results2 to inspect each goal.
-- The JSON literals are single-quoted so the embedded double quotes parse.
with subscriptions as (
  -- Dummy data: one JSON document per subscription.
  select '{"currentSize":"2", "sizeHistory":[{"from":"1", "to":"2", "timestamp":{"_seconds":1588543200}}], "createdAt": {"_seconds":1587543200}}' as data,
         'docWithSingleHistory' as document_name union all
  select '{"currentSize":"3", "sizeHistory":[{"from":"1", "to":"2", "timestamp":{"_seconds":1588543200}}, {"from":"2", "to":"3", "timestamp":{"_seconds":1589543200}}], "createdAt": {"_seconds":1587543200}}' as data,
         'docWithMoreHistory' as document_name union all
  select '{"currentSize":"3", "createdAt": {"_seconds":1587543200}}' as data,
         'docWithoutHistory' as document_name
),
extract_json1 as (
  -- Pull the top-level JSON fields out of each document.
  select
    document_name,
    json_extract_scalar(data,'$.currentSize') as current_size,
    json_extract_array(data,'$.sizeHistory') as size_history,
    json_extract(data,'$.createdAt._seconds') as created_at_seconds
  from subscriptions
  order by document_name  -- readability only; remove against real data
),
extract_json2 as (
  -- Explode the sizeHistory array. LEFT JOIN keeps documents with no history
  -- (they get a single row of NULL history columns).
  select
    document_name,
    current_size,
    created_at_seconds,
    json_extract_scalar(s,'$.from') as size_history_from,
    json_extract_scalar(s,'$.to') as size_history_to,
    json_extract_scalar(s,'$.timestamp._seconds') as size_history_seconds
  from extract_json1
  left join unnest(size_history) s
  order by document_name  -- readability only
),
working as (
  -- Cast everything to INT64 so arithmetic and ordering behave numerically.
  select
    document_name,
    cast(current_size as int64) as current_size,
    cast(created_at_seconds as int64) as created_at_seconds,
    cast(size_history_from as int64) as size_history_from,
    cast(size_history_to as int64) as size_history_to,
    cast(size_history_seconds as int64) as size_history_seconds
  from extract_json2
  order by document_name, size_history_seconds  -- readability only
),
results1_temp as (
  -- Rank each document's history rows by timestamp; rank 1 = earliest change.
  -- This is the per-document minimum the question was missing.
  select
    *,
    row_number() over (partition by document_name order by size_history_seconds asc) as size_history_order
  from working
  order by document_name, size_history_seconds  -- readability only
),
results1 as (
  -- Goal 1: initial size per subscription. A document without history has one
  -- row with NULL size_history_from, so fall back to its current_size.
  select
    coalesce(size_history_from, current_size) as start_size,
    count(distinct document_name) as number_of_subscriptions
  from results1_temp
  where size_history_order = 1
  group by 1
),
results2_temp as (
  -- Pair each change with the previous change's timestamp (or createdAt for
  -- the first change) to measure time spent at each size.
  select
    document_name,
    size_history_from,
    size_history_to,
    size_history_seconds,
    ifnull(lag(size_history_seconds,1) over(partition by document_name order by size_history_seconds asc), created_at_seconds) as prev_size_seconds
  from working
  order by document_name, size_history_seconds  -- readability only
),
results2 as (
  -- Goal 2: per size change, how many subscriptions made it and the average
  -- days it took (seconds converted to days).
  select
    concat(size_history_from, ' to ', size_history_to) as change,
    count(distinct document_name) as number_of_subscriptions,
    avg( (ifnull(size_history_seconds,0) - ifnull(prev_size_seconds,0)) / (60*60*24) ) as avg_days_to_change
  from results2_temp
  where size_history_from is not null
  group by 1
)
select * from results1
-- select * from results2
*Note: remove the interior CTE `order by` statements (but not the ones inside the window functions) before you run this against any real data, as they add unnecessary overhead.