By using db<>fiddle, you agree to license everything you submit by Creative Commons CC0.
Help with an interesting Postgres question: Why isn't an Index Only Scan used on a partition accessed via the parent table?.
create extension if not exists postgis;
select setseed(.42);
drop table if exists "potential_missed_areas";
create table "potential_missed_areas"(
"ID" bigint generated by default as identity primary key
,"areaID" int default random()*1e2
,"campaignid" int default -1
,"status" text default 'approved'
,"metaData" jsonb
,"createdAt" timestamptz default now()-'1 week'::interval-random()*'1 year'::interval
,"updatedAt" timestamptz default now()-random()*'1 week'::interval
,"centerPoint" geometry(Point,4326) default st_makepoint( 60.86+random()*(77.83-60.86)
,23.64+random()*(37.07-23.64) )
);
--WGS84 bounds of 24313: x60.86y23.64 - 77.83x37.07y
insert into "potential_missed_areas"
select from generate_series(1,4e3);
--on a surface area this large, 4k points is too few
--20km radius is too short to form any large clusters given completely random points
select "ID","areaID","campaignid","status"
,"metaData","createdAt","updatedAt",st_astext("centerPoint")
from "potential_missed_areas"
tablesample bernoulli(.42)repeatable(.42)limit 12;
WITH clustered_data AS (-- Step 1: Perform initial clustering
SELECT pma.*
, ST_ClusterKMeans(ST_Transform("centerPoint", 24313), 1, 20000)
OVER (PARTITION BY "areaID") AS "nta_cluster_id" -- Initial clustering
, 1 AS "recursion_depth" -- Initialize recursion depth
FROM "potential_missed_areas" pma
WHERE pma."campaignid" = -1
AND pma."status" = 'approved'
)SELECT "areaID"
, "nta_cluster_id"
CREATE EXTENSION
setseed |
---|
SELECT 1
DROP TABLE
CREATE TABLE
INSERT 0 4000
ID | areaID | campaignid | status | metaData | createdAt | updatedAt | st_astext |
---|---|---|---|---|---|---|---|
286 | 75 | -1 | approved | null | 2024-04-03 19:59:23.258174+01 | 2024-09-19 09:28:57.330468+01 | POINT(64.58429284130493 23.753412896361144) |
546 | 30 | -1 | approved | null | 2024-09-04 00:24:48.160574+01 | 2024-09-23 23:01:18.055657+01 | POINT(77.79583208123906 36.381533963493965) |
1137 | 65 | -1 | approved | null | 2023-12-03 17:59:20.262974+00 | 2024-09-21 23:09:29.015759+01 | POINT(67.25468081016629 28.313902495418464) |
1356 | 47 | -1 | approved | null | 2023-11-27 03:29:17.728574+00 | 2024-09-21 15:43:34.027639+01 | POINT(77.22719985075432 25.365093641778945) |
2218 | 55 | -1 | approved | null | 2024-02-17 23:02:02.406974+00 | 2024-09-20 04:14:50.665775+01 | POINT(76.48835819631744 33.889179693474205) |
2572 | 21 | -1 | approved | null | 2024-03-13 23:49:19.178174+00 | 2024-09-20 08:36:17.347207+01 | POINT(61.17018991318976 36.17807579830907) |
3284 | 77 | -1 | approved | null | 2023-11-14 03:18:36.381374+00 | 2024-09-20 13:35:55.397669+01 | POINT(70.44498443014285 25.508654918616426) |
3535 | 28 | -1 | approved | null | 2024-01-09 12:35:22.768574+00 | 2024-09-23 19:41:18.455056+01 | POINT(70.84526728098265 24.68004435652911) |
SELECT 8
areaID | nta_cluster_id | cluster_size |
---|---|---|
92 | 1 | 3 |
82 | 11 | 2 |
78 | 31 | 2 |
68 | 5 | 2 |
82 | 23 | 2 |
SELECT 5
DROP TABLE IF EXISTS clustered_points;
CREATE TABLE clustered_points AS
WITH clustered_data AS (-- Step 1: Perform initial clustering
SELECT pma.*
, ST_ClusterKMeans(ST_Transform("centerPoint", 24313), 1, 2e4)
OVER (PARTITION BY "areaID") AS "nta_cluster_id" -- Initial clustering
FROM "potential_missed_areas" pma
WHERE pma."campaignid" = -1
AND pma."status" = 'approved'
),cluster_sizes AS (-- Step 2: Calculate cluster sizes
SELECT "areaID"
, "nta_cluster_id"
, COUNT(*)::int AS "cluster_size"
FROM clustered_data
GROUP BY 1,2
),dataset as (-- Final selection
SELECT *
, ST_ClusterKMeans( ST_Transform("centerPoint", 24313)
,GREATEST(1,CEIL("cluster_size" / 20.0))::int)
OVER (PARTITION BY "areaID", "nta_cluster_id") AS "new_cluster_id" -- Re-cluster based on size
FROM clustered_data
JOIN cluster_sizes USING("areaID","nta_cluster_id")
)SELECT *
, format( '"areaID":%s,"nta_cluster_id":%s,"new_cluster_id":%s'
,"areaID","nta_cluster_id","new_cluster_id") as "full_cluster_identifier"
, rank()over(order by "areaID","nta_cluster_id","new_cluster_id") as "final_cluster_id"
FROM dataset;
drop table if exists cluster_shapes;
create table cluster_shapes as
select min("full_cluster_identifier") as "full_cluster_identifier"
, min("final_cluster_id") as "final_cluster_id"
, min("cluster_size") as "cluster_size"
--, st_optimalalphashape(st_collect("centerPoint")) as geom
, st_convexhull(st_collect("centerPoint")) as geom
from clustered_points
DROP TABLE
SELECT 4000
DROP TABLE
SELECT 0