[Edit] : this answer helped DELETE FROM Table WHERE ‘RANK’ > 1
I have a table which keeps track of some entity’s status history (tb_status)
DDL
CREATE TABLE table_aa ( id varchar(255) NOT NULL, -- other columns CONSTRAINT table_aa_pkey PRIMARY KEY (id) ); CREATE TABLE table_ab ( id int4 NOT NULL, ref_entity_a varchar(255) NOT NULL, -- other columns CONSTRAINT table_ab_pkey PRIMARY KEY (id, ref_entity_a), CONSTRAINT fk_entity_a FOREIGN KEY (ref_entity_a) REFERENCES table_aa(id) ); CREATE TABLE tb_status ( status_id int8 NOT NULL, ref_entity_aa varchar(255) NOT NULL, ref_entity_ab int4 NOT NULL, insert_timestamp timestamptz NOT NULL, status varchar(255) NULL, -- this is an enumeration -- other columns CONSTRAINT estatus_pkey PRIMARY KEY (ref_entity_aa, ref_entity_ab, status_id), CONSTRAINT fk_entity_aa FOREIGN KEY (ref_entity_aa) REFERENCES table_aa(id), CONSTRAINT fk_entity_ab FOREIGN KEY (ref_entity_aa, ref_entity_ab) REFERENCES table_ab(ref_entity_a,id) );
here is an example
status_id | ref_entity_aa | ref_entity_ab | status |
---|---|---|---|
1 | “a” | 1 | OK |
1 | “a” | 2 | OK |
1 | “b” | 1 | OK |
1 | “b” | 2 | OK |
2 | “a” | 1 | OK |
2 | “a” | 2 | OK |
2 | “b” | 1 | OK |
2 | “b” | 2 | ERROR |
I want to keep the last n records per primary key per status
so if n was 1 after the delete the table should look like this
status_id | ref_entity_aa | ref_entity_ab | status |
---|---|---|---|
1 | “b” | 2 | OK |
2 | “a” | 1 | OK |
2 | “a” | 2 | OK |
2 | “b” | 1 | OK |
2 | “b” | 2 | ERROR |
select ref_entity_aa, ref_entity_ab, status, count(*) from tb_status group by ref_entity_aa, ref_entity_ab, status ;
This group by query returns the number of rows for each group
In the example this would be
ref_entity_aa | ref_entity_ab | status | count |
---|---|---|---|
“a” | 1 | OK | 2 |
“a” | 2 | OK | 2 |
“b” | 1 | OK | 2 |
“b” | 2 | OK | 1 |
“b” | 2 | ERROR | 1 |
After the delete query executes the group by query should return
ref_entity_aa | ref_entity_ab | status | count |
---|---|---|---|
“a” | 1 | OK | 1 |
“a” | 2 | OK | 1 |
“b” | 1 | OK | 1 |
“b” | 2 | OK | 1 |
“b” | 2 | ERROR | 1 |
The goal is to delete old records (i.e. records that have lowest value of status_id ) so that this group by query returns a count <= n
I was able to come up with this query
delete from tb_status as ts where (ts.ref_entity_aa || ts.ref_entity_ab || ts.status_id) in ( -- the same constructed_id select ranked_query.constructed_id from ( select (ts.ref_entity_aa || ts.ref_entity_ab || ts.status_id) as constructed_id, rank() over (partition by ts.ref_entity_aa, ts.ref_entity_ab, ts.status order by ts.status_id desc) as ranking from tb_status as ts ) as ranked_query where ranked_query.ranking > :numberOfRecordsToKeep -- the n in the question );
Advertisement
Answer
I use a rank() window function to determine which record is older (ordered descending). Then I use a subquery to isolate the primary key that identifies the record to be deleted. This array of primary keys can be used in a WHERE clause to delete the records based on the primary key defined in the subquery.
IF OBJECT_ID('TEMPDB..#TEMP') IS NOT NULL DROP TABLE #TEMP CREATE TABLE #TEMP ( PrimKey INT ,status_id INT ,ref_entity_aa NVARCHAR(10) ,ref_entity_ab INT ,[status] NVARCHAR(10) ); INSERT INTO #TEMP (PrimKey,status_id,ref_entity_aa,ref_entity_ab,[status]) VALUES (1000,1,'a',1,'OK') ,(1001,1,'a',2,'OK') ,(1002,1,'b',1,'OK') ,(1003,1,'b',2,'OK') ,(1004,2,'a',1,'OK') ,(1005,2,'a',2,'OK') ,(1006,2,'b',1,'OK') ,(1007,2,'b',2,'ERROR') SELECT * FROM #TEMP DELETE #TEMP WHERE Primkey NOT IN ( SELECT Primkey FROM( SELECT PrimKey ,status_id ,RANK() OVER(PARTITION BY ref_entity_aa,ref_entity_ab,[status] ORDER BY status_id DESC) [rank] ,ref_entity_aa ,ref_entity_ab ,[status] FROM #TEMP )A WHERE [rank] <= 1 --'N' ) SELECT * FROM #TEMP
Let me know if this works!
OUTPUT BEFORE AND AFTER DELETE:
Or use this solution if you do not have a primary key. You can calculate one with your composite key columns and do the same idea as above. Just keep in mind the numerical datatypes will need to be converted to nvarchar() when there are NVARCHAR columns involved.
SELECT --Before Delete CAST(T.[status_id] AS NVARCHAR(1))+T.ref_entity_aa+CAST(T.ref_entity_ab AS NVARCHAR(1))+T.[status] ,* FROM #TEMP T DELETE #TEMP WHERE CAST([status_id] AS NVARCHAR(1))+ref_entity_aa+CAST(ref_entity_ab AS NVARCHAR(1))+[status] NOT IN ( SELECT CAST([status_id] AS NVARCHAR(1))+ref_entity_aa+CAST(ref_entity_ab AS NVARCHAR(1))+[status] FROM( SELECT status_id ,RANK() OVER(PARTITION BY ref_entity_aa,ref_entity_ab,[status] ORDER BY status_id DESC) [rank] ,ref_entity_aa ,ref_entity_ab ,[status] FROM #TEMP )A WHERE [rank] <= 1 --'N' ) SELECT --After Delete CAST([status_id] AS NVARCHAR(1))+ref_entity_aa+CAST(ref_entity_ab AS NVARCHAR(1))+[status] ,T.ref_entity_aa ,T.ref_entity_ab ,T.[status] FROM #TEMP t