By using db<>fiddle, you agree to license everything you submit by Creative Commons CC0.
Help with an interesting Postgres question: Why isn't an Index Only Scan used on a partition accessed via the parent table?.
--Long-format store: at most one row per (expId, iteration) pair, enforced by the key.
CREATE TABLE Measures (
    expId     serial,
    iteration integer NOT NULL,
    value     real    NOT NULL, --real is the standard spelling of float4
    CONSTRAINT measures_pkey PRIMARY KEY (expId, iteration)
);
--So, a table of various measurements, repeated for n iterations.
--Though, because we have more data than originally expected,
--I want to move to a new table layout that instead uses an array column,
--which overall gives better performance (already tested and benchmarked):
--Array-format replacement: one row per expId carrying all of its values in a single array.
CREATE TABLE TmpMeasures (
    expId    serial,
    "values" real[] NOT NULL, --real[] is the standard spelling of float4[]
    CONSTRAINT tmpmeasures_pkey PRIMARY KEY (expId)
);
--My problem now is how to get the old data into the new format.
--The data may look something like this.
--Note that iterations don't always produce all data,
--so there may be NULL values in the final array:
--Fix the seed used by the random() calls in the INSERT below.
select setseed(.42);--stabilise random() between tests
--Populate Measures from the 60 x 800 grid of (expId, iteration) pairs.
--Each pair that is kept gets random()*10 rounded to one decimal place as its value.
INSERT INTO Measures (expId, iteration, value)
select expId, iteration, round(random()::numeric*10,1)
from generate_series(1,60)expId
--the join condition references neither column: a pair is kept only when .6>random()
join generate_series(1,800)iteration on .6>random();
--each expId randomly tries to get a value for an iteration
--40% of the time it will not get an iteration, therefore no value, no entry
--now below 30k, to let the looped update finish
--array_agg took 370ms on 120k Measures total,
--plpgsql looped update failed without completing
--Preview at most 15 rows from a repeatable Bernoulli sample (0.42 percent) of Measures.
SELECT expId, iteration, value
FROM Measures TABLESAMPLE BERNOULLI (0.42) REPEATABLE (0.42)
LIMIT 15;
CREATE TABLE
CREATE TABLE
setseed |
---|
SELECT 1
INSERT 0 28716
expid | iteration | value |
---|---|---|
1 | 187 | 0.5 |
2 | 242 | 4.6 |
2 | 464 | 8.7 |
2 | 710 | 6.8 |
3 | 635 | 9.2 |
5 | 450 | 5 |
5 | 464 | 5.9 |
6 | 699 | 0 |
8 | 27 | 7.2 |
8 | 467 | 1.4 |
8 | 497 | 4.9 |
12 | 14 | 6.9 |
12 | 112 | 0.3 |
12 | 256 | 2 |
12 | 328 | 1.4 |
SELECT 15
--Rebuild TmpMeasures from Measures in one statement and report the executed plan.
--EXPLAIN ANALYZE really runs the CREATE TABLE AS, so the table exists afterwards.
DROP TABLE IF EXISTS TmpMeasures;
EXPLAIN (ANALYZE, VERBOSE)
CREATE TABLE IF NOT EXISTS TmpMeasures AS
WITH iteration_bounds AS (
    --smallest and largest iteration present anywhere in Measures
    SELECT
        min(iteration) AS first_iteration,
        max(iteration) AS last_iteration
    FROM Measures
),
known_exp_ids AS (
    --every expId that has at least one measurement
    SELECT DISTINCT expId
    FROM Measures
)
SELECT
    d.expId,
    --one slot per iteration in the series; pairs without a row in Measures yield NULL.
    --Array position 1 corresponds to first_iteration, not necessarily to iteration 1.
    array_agg(m.value ORDER BY g.iteration) AS "values"
FROM iteration_bounds AS b
CROSS JOIN known_exp_ids AS d
CROSS JOIN generate_series(b.first_iteration, b.last_iteration) AS g(iteration)
LEFT JOIN Measures AS m
    ON g.iteration = m.iteration
   AND d.expId = m.expId
GROUP BY d.expId;
/*
--needed to set the identity column to the right position
--subselect in alter table won't be allowed
select max(expId)+1 from TmpMeasures;
--add column constraints
alter table TmpMeasures
add primary key (expId)
,alter column expId add generated by default as identity(start with 4)
,alter column values set not null;
--test to see if the identity column works fine, starting from the latest
insert into TmpMeasures(values) select '{1,2,3,4}' returning *;
*/
DROP TABLE
QUERY PLAN |
---|
GroupAggregate (cost=26740.47..28242.97 rows=200 width=36) (actual time=79.905..90.117 rows=60 loops=1) |
Output: measures_1.expid, array_agg(m.value ORDER BY g.iteration) |
Group Key: measures_1.expid |
-> Sort (cost=26740.47..27240.47 rows=200000 width=12) (actual time=79.688..82.897 rows=48000 loops=1) |
Output: measures_1.expid, m.value, g.iteration |
Sort Key: measures_1.expid, g.iteration |
Sort Method: quicksort Memory: 3261kB |
-> Hash Left Join (cost=2138.76..5711.33 rows=200000 width=12) (actual time=23.440..45.293 rows=48000 loops=1) |
Output: measures_1.expid, m.value, g.iteration |
Inner Unique: true |
Hash Cond: ((g.iteration = m.iteration) AND (measures_1.expid = m.expid)) |
-> Nested Loop (cost=1187.16..3709.67 rows=200000 width=8) (actual time=12.184..21.588 rows=48000 loops=1) |
Output: g.iteration, measures_1.expid |
-> Nested Loop (cost=633.36..653.37 rows=1000 width=4) (actual time=5.480..5.700 rows=800 loops=1) |
Output: g.iteration |
-> Aggregate (cost=633.36..633.37 rows=1 width=8) (actual time=5.353..5.354 rows=1 loops=1) |
Output: min(measures.iteration), max(measures.iteration) |
-> Seq Scan on public.measures (cost=0.00..474.24 rows=31824 width=4) (actual time=0.012..2.727 rows=28716 loops=1) |
Output: measures.expid, measures.iteration, measures.value |
-> Function Scan on pg_catalog.generate_series g (cost=0.00..10.00 rows=1000 width=4) (actual time=0.124..0.227 rows=800 loops=1) |
Output: g.iteration |
Function Call: generate_series((min(measures.iteration)), (max(measures.iteration))) |
-> Materialize (cost=553.80..556.80 rows=200 width=4) (actual time=0.008..0.012 rows=60 loops=800) |
Output: measures_1.expid |
-> HashAggregate (cost=553.80..555.80 rows=200 width=4) (actual time=6.691..6.702 rows=60 loops=1) |
Output: measures_1.expid |
Group Key: measures_1.expid |
Batches: 1 Memory Usage: 40kB |
-> Seq Scan on public.measures measures_1 (cost=0.00..474.24 rows=31824 width=4) (actual time=0.007..2.309 rows=28716 loops=1) |
Output: measures_1.expid, measures_1.iteration, measures_1.value |
-> Hash (cost=474.24..474.24 rows=31824 width=12) (actual time=10.881..10.882 rows=28716 loops=1) |
Output: m.value, m.iteration, m.expid |
Buckets: 32768 Batches: 1 Memory Usage: 1490kB |
-> Seq Scan on public.measures m (cost=0.00..474.24 rows=31824 width=12) (actual time=0.008..3.698 rows=28716 loops=1) |
Output: m.value, m.iteration, m.expid |
Planning Time: 1.077 ms |
Execution Time: 94.321 ms |
EXPLAIN
--Overlay the values stored in Measures onto the arrays in TmpMeasures:
--for every TmpMeasures row, each Measures row with the same expId and an
--iteration >= 1 is written to array slot [iteration]. Slots with no matching
--measure keep whatever they held before.
--
--Takes no arguments and returns nothing; the effect is the updated TmpMeasures.
--
--Differences from a one-UPDATE-per-iteration loop:
--  * an empty Measures table is a no-op instead of raising on a NULL loop bound;
--  * each TmpMeasures row is written at most once (and not at all when nothing
--    changes) instead of once per iteration, so far fewer row versions are created.
create function plpgsql_looped_update() returns void as $do$
DECLARE
    --FOR UPDATE is what allows WHERE CURRENT OF to write back the row just fetched.
    targets CURSOR FOR
        SELECT tm.expId, tm."values"
        FROM TmpMeasures tm
        FOR UPDATE;
    measure record;
    merged float4[];
BEGIN
    FOR target IN targets LOOP
        merged := target."values";

        --Apply the measures in ascending iteration order, i.e. the same order in
        --which a 1..max(iteration) loop would have assigned them.
        FOR measure IN
            SELECT m.iteration, m.value
            FROM Measures m
            WHERE m.expId = target.expId
              AND m.iteration >= 1
            ORDER BY m.iteration
        LOOP
            merged[measure.iteration] := measure.value;
        END LOOP;

        --Skip the write when no slot changed, to avoid a pointless new row version.
        IF merged IS DISTINCT FROM target."values" THEN
            UPDATE TmpMeasures
            SET "values" = merged
            WHERE CURRENT OF targets;
        END IF;
    END LOOP;
END $do$ language plpgsql;
--Execute the looped update once and report its timing.
EXPLAIN (ANALYZE, VERBOSE)
SELECT plpgsql_looped_update();
CREATE FUNCTION
server closed the connection unexpectedly This probably means the server terminated abnormally before or while processing the request.