add batch remove batch split batch comment selection show hidden batches hide batch highlight batch
db<>fiddle
donate feedback about
By using db<>fiddle, you agree to license everything you submit by Creative Commons CC0.
Help with an interesting Postgres question: Why isn't an Index Only Scan used on a partition accessed via the parent table?.
-- table for storing data processing status

CREATE TABLE status_table (
base_name text NOT NULL
, version smallint NOT NULL
, ref_time int NOT NULL
, processed bool NOT NULL
, processing bool NOT NULL
, updated int NOT NULL
, PRIMARY KEY (base_name, version)
);
INSERT INTO status_table VALUES
-- --------- new ref_time batch 27793000 ----------
-- settled and complete
-- ------------------------------------------------
('abc', 1, 27793000, 't', 'f', (extract(epoch from now()) / 60) - 500)
, ('jkl', 1, 27793000, 't', 'f', (extract(epoch from now()) / 60) - 450)
, ('pqr', 1, 27793000, 't', 'f', (extract(epoch from now()) / 60) - 400)
-- --------- new ref_time batch 27793300 ----------
-- settled and complete
-- ------------------------------------------------
, ('def', 1, 27793300, 't', 'f', (extract(epoch from now()) / 60) - 350)
, ('hft', 1, 27793300, 't', 'f', (extract(epoch from now()) / 60) - 300)
-- --------- new ref_time batch 27793600 ----------
-- settled and NOT complete
-- ------------------------------------------------
, ('ghi', 1, 27793600, 'f', 't', (extract(epoch from now()) / 60) - 275)
, ('wqt', 1, 27793600, 't', 'f', (extract(epoch from now()) / 60) - 250)
-- --------- new ref_time batch 27793900 ----------
-- NOT settled and NOT complete
-- ------------------------------------------------
, ('rfe', 1, 27793900, 't', 'f', (extract(epoch from now()) / 60) - 225)
-- any 'updated' time above here (threshold is -200) counts as 'settled' (if all in ref_time batch are settled)
, ('mno', 1, 27793900, 'f', 't', (extract(epoch from now()) / 60) - 175)
, ('dpw', 1, 27793900, 't', 'f', (extract(epoch from now()) / 60) - 150)
CREATE TABLE
INSERT 0 12
-- the actual data table

CREATE TABLE data_table (
location text NOT NULL
, param_id text NOT NULL
, ref_time int NOT NULL
, fcst_time smallint NOT NULL
, timestamp int NOT NULL
, value text NOT NULL
, PRIMARY KEY (location, param_id, ref_time, fcst_time)
);

-- note that: 'ref_time' + 'fcst_time' = 'timestamp'
-- I know it's not ideal to store 'timestamp' when it can be derived from 'ref_time' and 'fcst_time' (could just store those two)
-- but all three bits of info are in the raw data so I store all three rather than deriving 'timestamp' after extraction
-- maybe I could/should reduce table storage by storing only 'ref_time' and 'fcst_time' and deriving 'timestamp'?

INSERT INTO data_table VALUES
-- --------- new ref_time batch 27793000 ----------
('aaa', 'temp', 27793000, 0, 27793000, '21.0')
, ('aaa', 'temp', 27793000, 100, 27793100, '20.1')
, ('aaa', 'temp', 27793000, 200, 27793200, '19.6')
, ('aaa', 'temp', 27793000, 300, 27793300, '18.3')
, ('aaa', 'temp', 27793000, 400, 27793400, '17.1')
, ('aaa', 'temp', 27793000, 500, 27793500, '16.4')
, ('aaa', 'temp', 27793000, 600, 27793600, '16.4')
, ('aaa', 'temp', 27793000, 700, 27793700, '16.4')
, ('aaa', 'temp', 27793000, 800, 27793800, '16.4')
, ('aaa', 'temp', 27793000, 900, 27793900, '16.4')
-- new param in same ref_time batch
-- note that the timestamps overlap with previous batch
, ('aaa', 'humi', 27793000, 0, 27793000, '55.0')
, ('aaa', 'humi', 27793000, 100, 27793100, '57.1')
, ('aaa', 'humi', 27793000, 200, 27793200, '62.6')
, ('aaa', 'humi', 27793000, 300, 27793300, '76.3')
, ('aaa', 'humi', 27793000, 400, 27793400, '72.1')
CREATE TABLE
INSERT 0 44
-- just a query to show some useful stats from status_table
-- not all of it is needed for the next stage... just useful to visualise
SELECT ref_time
, max(updated) AS max_updated
, count(*) AS total
, count(*) FILTER (WHERE processed) AS processed
, round(count(*) FILTER (WHERE processed) * 100.0 / count(*), 1) AS percent
, round(round(count(*) FILTER (WHERE processed) * 1.0 / count(*), 1) * 100) AS rounded
-- a batch is 'complete' when all files in the batch have been processed:
, count(*) FILTER (WHERE processed) = count(*) AS complete
-- we assume no more data coming in ('settled') when a batch hasn't been updated in a while:
, max(updated) < (round(extract(epoch from now()) / 60) - 200) AS settled
-- ready is when the batch is 'complete' AND 'settled':
, (count(*) FILTER (WHERE processed) = count(*)) AND (max(updated) < (round(extract(epoch from now()) / 60) - 200)) AS ready
FROM status_table
GROUP BY ref_time
ORDER BY ready DESC, rounded DESC, ref_time DESC
ref_time max_updated total processed percent rounded complete settled ready
27793300 28680249 2 2 100.0 100 t t t
27793000 28680149 3 3 100.0 100 t t t
27794200 28680449 2 2 100.0 100 t f f
27793900 28680399 3 2 66.7 70 f f f
27793600 28680299 2 1 50.0 50 f t f
SELECT 5
WITH
stats AS (
SELECT ref_time
, count(*) FILTER (WHERE processed) = count(*) AS complete
, round(round(count(*) FILTER (WHERE processed) * 1.0 / count(*), 1) * 100) AS rounded
, max(updated) < (round(extract(epoch from now()) / 60) - 200) AS settled
, (count(*) FILTER (WHERE processed) = count(*)) AND (max(updated) < (round(extract(epoch from now()) / 60) - 200)) AS ready
FROM status_table
GROUP BY ref_time
ORDER BY ready DESC, rounded DESC, ref_time DESC
),
first AS (
SELECT ref_time FROM stats LIMIT 1
),
ref1 AS (
-- select all ready batches other than the first in order above
-- but only delete from those other batches where timestamp is less than max timestamp of first batch (see maxts CTE below)
SELECT ref_time
FROM stats
WHERE ready AND ref_time != (SELECT ref_time FROM first)
),
ref2 AS (
-- select all settled but incomplete batches
-- we can delete all records in these
SELECT ref_time
FROM stats
WHERE settled AND NOT complete
),
maxts AS (
SELECT ref_time, max(timestamp) AS max_timestamp
FROM data_table
WHERE ref_time = (SELECT ref_time FROM first)
GROUP BY ref_time
)
DELETE FROM data_table AS d
USING ref2 AS r2
DELETE 6