Postgres 15

By using db<>fiddle, you agree to license everything you submit by Creative Commons CC0.

Help with an interesting Postgres question: Why isn't an Index Only Scan used on a partition accessed via the parent table?.

create extension if not exists postgis;

select setseed(.42);

drop table if exists "potential_missed_areas";

create table "potential_missed_areas"(

"ID" bigint generated by default as identity primary key

,"areaID" int default random()*1e2

,"campaignid" int default -1

,"status" text default 'approved'

,"metaData" jsonb

,"createdAt" timestamptz default now()-'1 week'::interval-random()*'1 year'::interval

,"updatedAt" timestamptz default now()-random()*'1 week'::interval

,"centerPoint" geometry(Point,4326) default st_makepoint( 60.86+random()*(77.83-60.86)

,23.64+random()*(37.07-23.64) )

);

--WGS84 bounds of 24313: x60.86y23.64 - 77.83x37.07y

insert into "potential_missed_areas"

select from generate_series(1,4e3);

--on a surface area this large, 4k points is too few

--20km radius is too short to form any large clusters given completely random points

select "ID","areaID","campaignid","status"

,"metaData","createdAt","updatedAt",st_astext("centerPoint")

from "potential_missed_areas"

tablesample bernoulli(.42)repeatable(.42)limit 12;

WITH clustered_data AS (-- Step 1: Perform initial clustering

SELECT pma.*

, ST_ClusterKMeans(ST_Transform("centerPoint", 24313), 1, 20000)

OVER (PARTITION BY "areaID") AS "nta_cluster_id" -- Initial clustering

, 1 AS "recursion_depth" -- Initialize recursion depth

FROM "potential_missed_areas" pma

WHERE pma."campaignid" = -1

AND pma."status" = 'approved'

)SELECT "areaID"

, "nta_cluster_id"

CREATE EXTENSION

setseed

SELECT 1

DROP TABLE

CREATE TABLE

INSERT 0 4000

ID	areaID	campaignid	status	metaData	createdAt	updatedAt	st_astext
286	75	-1	approved	null	2024-04-03 19:59:23.258174+01	2024-09-19 09:28:57.330468+01	POINT(64.58429284130493 23.753412896361144)
546	30	-1	approved	null	2024-09-04 00:24:48.160574+01	2024-09-23 23:01:18.055657+01	POINT(77.79583208123906 36.381533963493965)
1137	65	-1	approved	null	2023-12-03 17:59:20.262974+00	2024-09-21 23:09:29.015759+01	POINT(67.25468081016629 28.313902495418464)
1356	47	-1	approved	null	2023-11-27 03:29:17.728574+00	2024-09-21 15:43:34.027639+01	POINT(77.22719985075432 25.365093641778945)
2218	55	-1	approved	null	2024-02-17 23:02:02.406974+00	2024-09-20 04:14:50.665775+01	POINT(76.48835819631744 33.889179693474205)
2572	21	-1	approved	null	2024-03-13 23:49:19.178174+00	2024-09-20 08:36:17.347207+01	POINT(61.17018991318976 36.17807579830907)
3284	77	-1	approved	null	2023-11-14 03:18:36.381374+00	2024-09-20 13:35:55.397669+01	POINT(70.44498443014285 25.508654918616426)
3535	28	-1	approved	null	2024-01-09 12:35:22.768574+00	2024-09-23 19:41:18.455056+01	POINT(70.84526728098265 24.68004435652911)

SELECT 8

areaID	nta_cluster_id	cluster_size
92	1	3
82	11	2
78	31	2
68	5	2
82	23	2

SELECT 5

DROP TABLE IF EXISTS clustered_points;

CREATE TABLE clustered_points AS

WITH clustered_data AS (-- Step 1: Perform initial clustering

SELECT pma.*

, ST_ClusterKMeans(ST_Transform("centerPoint", 24313), 1, 2e4)

OVER (PARTITION BY "areaID") AS "nta_cluster_id" -- Initial clustering

FROM "potential_missed_areas" pma

WHERE pma."campaignid" = -1

AND pma."status" = 'approved'

),cluster_sizes AS (-- Step 2: Calculate cluster sizes

SELECT "areaID"

, "nta_cluster_id"

, COUNT(*)::int AS "cluster_size"

FROM clustered_data

GROUP BY 1,2

),dataset as (-- Final selection

SELECT *

, ST_ClusterKMeans( ST_Transform("centerPoint", 24313)

,GREATEST(1,CEIL("cluster_size" / 20.0))::int)

OVER (PARTITION BY "areaID", "nta_cluster_id") AS "new_cluster_id" -- Re-cluster based on size

FROM clustered_data

JOIN cluster_sizes USING("areaID","nta_cluster_id")

)SELECT *

, format( '"areaID":%s,"nta_cluster_id":%s,"new_cluster_id":%s'

,"areaID","nta_cluster_id","new_cluster_id") as "full_cluster_identifier"

, rank()over(order by "areaID","nta_cluster_id","new_cluster_id") as "final_cluster_id"

FROM dataset;

drop table if exists cluster_shapes;

create table cluster_shapes as

select min("full_cluster_identifier") as "full_cluster_identifier"

, min("final_cluster_id") as "final_cluster_id"

, min("cluster_size") as "cluster_size"

--, st_optimalalphashape(st_collect("centerPoint")) as geom

, st_convexhull(st_collect("centerPoint")) as geom

from clustered_points

DROP TABLE

SELECT 4000

DROP TABLE

SELECT 0