Skip to content

Commit 38d47a3

Browse files
committed
Moved file to audit folder and fixed syntax
1 parent d3c25f8 commit 38d47a3

File tree

1 file changed

+173
-0
lines changed

1 file changed

+173
-0
lines changed
Lines changed: 173 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,173 @@
1+
/*
2+
* BigQuery Table Access and Usage Analysis Script
3+
*
4+
* This script analyzes BigQuery job history to determine when tables were last accessed
5+
* and provides insights into Data Manipulation Language (DML) and Data Definition Language (DDL)
6+
* operations performed on those tables.
7+
*
8+
* It helps identify:
9+
* - The last time a table was referenced in a query (last accessed time).
10+
* - The frequency of SELECT, INSERT, UPDATE, DELETE, TRUNCATE, and other DDL/DML operations.
11+
*
12+
* This information is valuable for:
13+
* - Identifying unused tables for potential deletion to optimize storage costs.
14+
* - Understanding table usage patterns for performance tuning and resource allocation.
15+
*/
16+
17+
18+
-- Set 'first_load' to TRUE for the initial run to scan the last 180 days of job history.
19+
-- Set 'first_load' to FALSE for subsequent runs to scan the last 2 days.
20+
declare first_load bool default false;
21+
declare days_to_scan int64 default if(first_load, 180, 2);
22+
23+
-- Update the admin project below
24+
declare admin_project string default '<default-admin-project>';
25+
26+
-- the BQ greatest function does not exclude nulls from comparison
27+
-- below function mimics the greatest function, but excludes nulls.
28+
-- fn_greatest(arr): returns the largest non-null element of the array `arr`,
-- or NULL when the array has no non-null element.
create temp function fn_greatest(arr any type) as (
  (
    select max(candidate)
    from unnest(arr) as candidate
    where candidate is not null
  )
);
33+
34+
-- extract relevant details from information_schema.jobs
35+
-- Stage the job-history columns used by the later statements.
-- Only jobs created within the last `days_to_scan` days are kept, and only
-- those whose reservation_id matches `admin_project` (used as a regex pattern).
create or replace temp table `temp_jobs_referenced_tables` as
select
    creation_time
  , statement_type
  , referenced_tables
  , destination_table
from `region-us`.INFORMATION_SCHEMA.JOBS_BY_ORGANIZATION
where date(creation_time) >= date_sub(current_date(), interval days_to_scan day)
  and regexp_contains(reservation_id, admin_project);
40+
41+
-- create table to store the table access details
42+
-- Daily per-table access summary: one row per report_date and table, holding
-- the last access time and one counter per statement type.
create table if not exists `optimization_workshop.tables_access_summary` (
    report_date                  date,       -- calendar date of the jobs summarized
    table_catalog                string,     -- project id of the table
    table_schema                 string,     -- dataset id of the table
    table_name                   string,     -- table id
    last_accessed_time           timestamp,  -- latest job creation_time touching the table
    select_count                 int64,
    update_count                 int64,
    delete_count                 int64,
    insert_count                 int64,
    merge_count                  int64,
    truncate_table_count         int64,
    create_table_count           int64,
    create_table_as_select_count int64,
    create_view_count            int64,
    load_data_count              int64,
    materialized_count           int64,      -- CREATE_MATERIALIZED_VIEW statements
    drop_table_count             int64,
    create_snapshot_table_count  int64
)
partition by report_date
cluster by table_catalog, table_schema, table_name;
65+
66+
-- delete existing data for the scanned date range
67+
-- Clear the dates about to be reloaded so that re-running the script does not
-- duplicate rows for the scanned window.
delete from `optimization_workshop.tables_access_summary`
where report_date >= date_sub(current_date(), interval days_to_scan day);
69+
70+
-- populate the table
71+
-- Load one row per (report_date, table) into the summary table.
-- Both CTEs read the staged job history in `temp_jobs_referenced_tables`;
-- the summary table itself has none of the job columns used here.
insert into `optimization_workshop.tables_access_summary` (
    report_date
  , table_catalog
  , table_schema
  , table_name
  , last_accessed_time
  , select_count
  , update_count
  , delete_count
  , insert_count
  , merge_count
  , truncate_table_count
  , create_table_count
  , create_table_as_select_count
  , create_view_count
  , load_data_count
  , materialized_count
  , drop_table_count
  , create_snapshot_table_count
)
with ref_tables as (
  -- Read side: every entry in a job's referenced_tables counts as one read
  -- of that table on the job's creation date.
  select
      date(creation_time) as report_date
    , rt.project_id
    , rt.dataset_id
    , rt.table_id
    , max(creation_time) as last_accessed_time
    , count(1) as select_count
  from `temp_jobs_referenced_tables`
  cross join unnest(referenced_tables) as rt
  group by all
), dest_tables as (
  -- Write side: non-SELECT jobs, bucketed by statement type per destination table.
  -- NOTE(review): jobs with no destination_table would yield a row whose table
  -- identifiers are NULL -- confirm whether those should be filtered out.
  select
      date(creation_time) as report_date
    , destination_table.project_id
    , destination_table.dataset_id
    , destination_table.table_id
    , max(creation_time) as last_accessed_time
    , sum(case when statement_type = 'UPDATE' then 1 else 0 end) as update_count
    , sum(case when statement_type = 'DELETE' then 1 else 0 end) as delete_count
    , sum(case when statement_type = 'INSERT' then 1 else 0 end) as insert_count
    , sum(case when statement_type = 'MERGE' then 1 else 0 end) as merge_count
    , sum(case when statement_type = 'TRUNCATE_TABLE' then 1 else 0 end) as truncate_table_count
    , sum(case when statement_type = 'CREATE_TABLE' then 1 else 0 end) as create_table_count
    , sum(case when statement_type = 'CREATE_TABLE_AS_SELECT' then 1 else 0 end) as create_table_as_select_count
    , sum(case when statement_type = 'CREATE_VIEW' then 1 else 0 end) as create_view_count
    , sum(case when statement_type = 'LOAD_DATA' then 1 else 0 end) as load_data_count
    , sum(case when statement_type = 'CREATE_MATERIALIZED_VIEW' then 1 else 0 end) as materialized_count
    , sum(case when statement_type = 'DROP_TABLE' then 1 else 0 end) as drop_table_count
    , sum(case when statement_type = 'CREATE_SNAPSHOT_TABLE' then 1 else 0 end) as create_snapshot_table_count
  from `temp_jobs_referenced_tables`
  -- every select statement will have a temp table as destination, skip it
  where statement_type not in ('SELECT')
  group by all
)
select
    coalesce(r.report_date, d.report_date) as report_date
  , coalesce(r.project_id, d.project_id) as table_catalog
  , coalesce(r.dataset_id, d.dataset_id) as table_schema
  , coalesce(r.table_id, d.table_id) as table_name
  , fn_greatest([r.last_accessed_time, d.last_accessed_time]) as last_accessed_time
  -- A table present on only one side of the full outer join has NULLs for the
  -- other side's counters; store 0 so the counts stay numeric.
  , coalesce(r.select_count, 0) as select_count
  , coalesce(d.update_count, 0) as update_count
  , coalesce(d.delete_count, 0) as delete_count
  , coalesce(d.insert_count, 0) as insert_count
  , coalesce(d.merge_count, 0) as merge_count
  , coalesce(d.truncate_table_count, 0) as truncate_table_count
  , coalesce(d.create_table_count, 0) as create_table_count
  , coalesce(d.create_table_as_select_count, 0) as create_table_as_select_count
  , coalesce(d.create_view_count, 0) as create_view_count
  , coalesce(d.load_data_count, 0) as load_data_count
  , coalesce(d.materialized_count, 0) as materialized_count
  , coalesce(d.drop_table_count, 0) as drop_table_count
  , coalesce(d.create_snapshot_table_count, 0) as create_snapshot_table_count
from ref_tables as r
full outer join dest_tables as d
  on r.report_date = d.report_date
  and r.project_id = d.project_id
  and r.dataset_id = d.dataset_id
  and r.table_id = d.table_id;
143+
144+
145+
-- DML and DDL counts on a table for last 30 days
146+
-- Per-table totals of each statement-type counter over the summary rows whose
-- report_date is later than 30 days before today.
select
    table_catalog
  , table_schema
  , table_name
  , sum(select_count) as select_count
  , sum(update_count) as update_count
  , sum(delete_count) as delete_count
  , sum(insert_count) as insert_count
  , sum(merge_count) as merge_count
  , sum(truncate_table_count) as truncate_table_count
  , sum(create_table_count) as create_table_count
  , sum(create_table_as_select_count) as create_table_as_select_count
  , sum(create_view_count) as create_view_count
  , sum(load_data_count) as load_data_count
  , sum(materialized_count) as materialized_count
  , sum(drop_table_count) as drop_table_count
  , sum(create_snapshot_table_count) as create_snapshot_table_count
from `optimization_workshop.tables_access_summary`
where report_date > date_sub(current_date(), interval 30 day)
group by table_catalog, table_schema, table_name;
165+
166+
167+
-- table last accessed
168+
-- Latest recorded access time for each table across all summary rows.
-- table_schema is part of the key: without it, same-named tables in different
-- datasets of one project would be merged into a single row.
select
    table_catalog
  , table_schema
  , table_name
  , max(last_accessed_time) as last_accessed_time
from `optimization_workshop.tables_access_summary`
group by table_catalog, table_schema, table_name;
173+

0 commit comments

Comments
 (0)