From 9fc4cf4278db6e47046e817f976743477dc20aa1 Mon Sep 17 00:00:00 2001
From: Gabe Wolofsky <80077912+gabrielwol@users.noreply.github.com>
Date: Thu, 8 Jun 2023 14:58:44 -0400
Subject: [PATCH 01/10] initial commit for generate_sql_readme.py

---
 bigdata_schema_readmes/generate_sql_readme.py | 95 +++++++++++++++++++
 1 file changed, 95 insertions(+)
 create mode 100644 bigdata_schema_readmes/generate_sql_readme.py

diff --git a/bigdata_schema_readmes/generate_sql_readme.py b/bigdata_schema_readmes/generate_sql_readme.py
new file mode 100644
index 0000000..4c13321
--- /dev/null
+++ b/bigdata_schema_readmes/generate_sql_readme.py
@@ -0,0 +1,95 @@
+import pandas
+import os.path
+from pathlib import Path
+import configparser
+from psycopg2 import connect
+
+CONFIG = configparser.ConfigParser()
+CONFIG.read(str(Path.home().joinpath('db.cfg'))) #Creates a path to your db.cfg file
+dbset = CONFIG['DBSETTINGS']
+con = connect(**dbset)
+
+######################
+##schema name goes here
+######################
+schema_name = 'rescu'
+
+#find table names from information_schema.tables
+table_sql = '''
+SELECT table_name
+FROM information_schema.tables
+WHERE table_schema = '{}'
+    AND table_type <> 'VIEW';
+'''
+
+#find column names and types from information_schema.columns
+columns_sql = '''
+SELECT column_name, data_type
+FROM information_schema.columns
+WHERE table_schema = '{}'
+    AND table_name = '{}';
+'''
+
+#first row of table as sample
+sample_sql = '''
+SELECT *
+FROM {}.{}
+LIMIT 1;
+'''
+
+#first row of table as sample
+rowcount_sql = '''
+SELECT COUNT(1)
+FROM {}.{};
+'''
+
+#create directory if not exists
+#home folder
+dir = "bigdata_schema_readmes"
+if os.path.exists(dir) is False:
+    os.mkdir(dir)
+    print("Creating directory: {}".format(dir))
+
+#remove file if exists
+fname = dir + "/{}_readme.txt".format(schema_name)
+if os.path.isfile(fname):
+    os.remove(fname)
+
+with con:
+    #identify tables within schema
+    tables = pandas.read_sql(table_sql.format(schema_name), con)
+
+    if tables.empty:
+        print("No tables found in schema '{}'".format(schema_name))
+
+    #for each table
+    for table_name in tables['table_name']:
+
+        #query columns & datatypes from information_schema
+        column_types = pandas.read_sql(columns_sql.format(schema_name, table_name), con)
+
+        #query sample row from schema.table and transpose
+        data_sample = pandas.read_sql(sample_sql.format(schema_name, table_name), con)
+        data_sample_T = data_sample.T
+        data_sample_T["column_name"] = data_sample_T.index
+        data_sample_T.rename(columns= {0: "sample"}, inplace=True)
+
+        #row count
+        row_count = pandas.read_sql(rowcount_sql.format(schema_name, table_name), con)
+
+        #merge sample with column types
+        final = column_types.merge(data_sample_T, on = 'column_name')
+        final['Comments'] = '' #blank column for comments
+
+        #markdown format for github
+        final_formatted = final.to_markdown(index = False)
+
+        #print for debugging
+        print(final_formatted)
+
+        #write formatted output with table name as header
+        with open(fname, "a") as file: #append
+            file.write("{}.{}\n".format(schema_name, table_name) +
+                       "Row count: {:,}\n".format(row_count['count'][0]) +
+                       final_formatted +
+                       "\n\n")
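
The table layout this first version emits comes straight from pandas' DataFrame.to_markdown(), which needs the tabulate package installed. A minimal sketch of the call on made-up data (column names and values are illustrative only; actual spacing may differ slightly):

    import pandas as pd

    final = pd.DataFrame({
        'column_name': ['dt', 'volume'],          # hypothetical columns
        'data_type': ['date', 'integer'],
        'sample': ['2023-06-08', 42],
        'Comments': ['', ''],
    })
    print(final.to_markdown(index=False))
    # | column_name   | data_type   | sample     | Comments   |
    # |:--------------|:------------|:-----------|:-----------|
    # | dt            | date        | 2023-06-08 |            |
    # | volume        | integer     | 42         |            |
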
From f1abfde5735fd09883082f90b2e6b8836a982c1b Mon Sep 17 00:00:00 2001
From: Gabe Wolofsky <80077912+gabrielwol@users.noreply.github.com>
Date: Thu, 8 Jun 2023 15:14:55 -0400
Subject: [PATCH 02/10] Added column comments

---
 bigdata_schema_readmes/generate_sql_readme.py | 27 ++++++++++++++++---
 1 file changed, 24 insertions(+), 3 deletions(-)

diff --git a/bigdata_schema_readmes/generate_sql_readme.py b/bigdata_schema_readmes/generate_sql_readme.py
index 4c13321..6824094 100644
--- a/bigdata_schema_readmes/generate_sql_readme.py
+++ b/bigdata_schema_readmes/generate_sql_readme.py
@@ -12,7 +12,7 @@
 ######################
 ##schema name goes here
 ######################
-schema_name = 'rescu'
+schema_name = 'wys'
 
 #find table names from information_schema.tables
 table_sql = '''
@@ -43,6 +43,23 @@
 FROM {}.{};
 '''
 
+column_comments_sql = '''
+    SELECT
+        c.column_name,
+        pgd.description
+    FROM pg_catalog.pg_statio_all_tables AS st
+    INNER JOIN pg_catalog.pg_description AS pgd ON (
+        pgd.objoid = st.relid
+    )
+    INNER JOIN information_schema.columns AS c ON (
+        pgd.objsubid = c.ordinal_position
+        AND c.table_schema = st.schemaname
+        AND c.table_name = st.relname
+    )
+    WHERE c.table_schema = '{}'
+        AND c.table_name = '{}';
+'''
+
 #create directory if not exists
 #home folder
 dir = "bigdata_schema_readmes"
@@ -77,9 +94,13 @@
         #row count
         row_count = pandas.read_sql(rowcount_sql.format(schema_name, table_name), con)
 
+        #column comments
+        column_comments = pandas.read_sql(column_comments_sql.format(schema_name, table_name), con)
+
         #merge sample with column types
-        final = column_types.merge(data_sample_T, on = 'column_name')
-        final['Comments'] = '' #blank column for comments
+        final = column_types.merge(data_sample_T, how = 'left', on = 'column_name')
+        final = final.merge(column_comments, how = 'left', on = 'column_name')
+        final['description'] = final['description'].fillna('')
 
         #markdown format for github
         final_formatted = final.to_markdown(index = False)

From 5108263bf7e9375db10a0b2a32b4bd6b4ab16a32 Mon Sep 17 00:00:00 2001
From: Gabe Wolofsky <80077912+gabrielwol@users.noreply.github.com>
Date: Thu, 8 Jun 2023 15:58:53 -0400
Subject: [PATCH 03/10] added column comments

---
 bigdata_schema_readmes/generate_sql_readme.py | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/bigdata_schema_readmes/generate_sql_readme.py b/bigdata_schema_readmes/generate_sql_readme.py
index 6824094..da64906 100644
--- a/bigdata_schema_readmes/generate_sql_readme.py
+++ b/bigdata_schema_readmes/generate_sql_readme.py
@@ -12,7 +12,8 @@
 ######################
 ##schema name goes here
 ######################
-schema_name = 'wys'
+schema_name = 'miovision_api'
+row_count_on = True #change to false to omit row counts (can be very slow on certain schemas)
 
 #find table names from information_schema.tables
 table_sql = '''
@@ -91,10 +92,11 @@
         data_sample_T["column_name"] = data_sample_T.index
         data_sample_T.rename(columns= {0: "sample"}, inplace=True)
 
-        #row count
-        row_count = pandas.read_sql(rowcount_sql.format(schema_name, table_name), con)
+        #row count
+        if row_count_on:
+            row_count = pandas.read_sql(rowcount_sql.format(schema_name, table_name), con)
 
-        #column comments
+        #column comments --tested with miovision_api (has 3 column comments)
         column_comments = pandas.read_sql(column_comments_sql.format(schema_name, table_name), con)
 
         #merge sample with column types
@@ -110,7 +112,7 @@
 
         #write formatted output with table name as header
         with open(fname, "a") as file: #append
-            file.write("{}.{}\n".format(schema_name, table_name) +
-                       "Row count: {:,}\n".format(row_count['count'][0]) +
-                       final_formatted +
-                       "\n\n")
+            file.write("{}.{}\n".format(schema_name, table_name))
+            if(row_count_on):
+                file.write("Row count: {:,}\n".format(row_count['count'][0]))
+            file.write(final_formatted + "\n\n")
\ No newline at end of file
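
The descriptions surfaced by column_comments_sql live in pg_catalog.pg_description and are created with COMMENT statements; a hypothetical example of how one gets there in the first place (table and wording are illustrative only):

    COMMENT ON COLUMN miovision_api.volumes.volume
        IS '15-minute vehicle count';  -- hypothetical table and comment text

Columns without such a comment simply have no pg_description row, which is why the script merges the comment result back with how = 'left' and fills the gaps with ''.
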
From d5fc506c2a1f57d91b99b75cef4b76fa479f1b9a Mon Sep 17 00:00:00 2001
From: Gabe Wolofsky <80077912+gabrielwol@users.noreply.github.com>
Date: Wed, 14 Jun 2023 16:25:08 -0400
Subject: [PATCH 04/10] Fix path for use in non-home dir. Add user input.

---
 bigdata_schema_readmes/generate_sql_readme.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/bigdata_schema_readmes/generate_sql_readme.py b/bigdata_schema_readmes/generate_sql_readme.py
index da64906..72f0f8c 100644
--- a/bigdata_schema_readmes/generate_sql_readme.py
+++ b/bigdata_schema_readmes/generate_sql_readme.py
@@ -4,16 +4,20 @@
 import configparser
 from psycopg2 import connect
 
+home_dir = os.path.expanduser('~')
+
 CONFIG = configparser.ConfigParser()
-CONFIG.read(str(Path.home().joinpath('db.cfg'))) #Creates a path to your db.cfg file
+CONFIG.read(os.path.join(home_dir, 'db.cfg')) #Creates a path to your db.cfg file
 dbset = CONFIG['DBSETTINGS']
 con = connect(**dbset)
 
 ######################
 ##schema name goes here
 ######################
-schema_name = 'miovision_api'
-row_count_on = True #change to false to omit row counts (can be very slow on certain schemas)
+schema_name = input("Input schema name to generate schema readme for:")
+#schema_name = 'rescu'
+row_count_on = input("Row count on? (True/False) Can be slow for certain schemas.")
+#row_count_on = True #change to false to omit row counts (can be very slow on certain schemas)
 
 #find table names from information_schema.tables
 table_sql = '''
@@ -63,7 +67,7 @@
 
 #create directory if not exists
 #home folder
-dir = "bigdata_schema_readmes"
+dir = home_dir + "/bigdata_schema_readmes"
 if os.path.exists(dir) is False:
     os.mkdir(dir)
     print("Creating directory: {}".format(dir))
@@ -73,6 +77,8 @@
 if os.path.isfile(fname):
     os.remove(fname)
 
+print("Destination path: " + fname)
+
 with con:
     #identify tables within schema
     tables = pandas.read_sql(table_sql.format(schema_name), con)

From 630f0173f33ee1734f2319ea0c16e3bb3822defc Mon Sep 17 00:00:00 2001
From: Gabe Wolofsky <80077912+gabrielwol@users.noreply.github.com>
Date: Mon, 19 Jun 2023 17:34:17 -0400
Subject: [PATCH 05/10] Fix issues with SQL string composition

---
 bigdata_schema_readmes/generate_sql_readme.py | 68 +++++++++++--------
 1 file changed, 40 insertions(+), 28 deletions(-)

diff --git a/bigdata_schema_readmes/generate_sql_readme.py b/bigdata_schema_readmes/generate_sql_readme.py
index 72f0f8c..4cd56d4 100644
--- a/bigdata_schema_readmes/generate_sql_readme.py
+++ b/bigdata_schema_readmes/generate_sql_readme.py
@@ -1,8 +1,8 @@
-import pandas
+import pandas as pd
 import os.path
-from pathlib import Path
 import configparser
 from psycopg2 import connect
+from psycopg2 import sql
 
 home_dir = os.path.expanduser('~')
 
@@ -14,41 +14,41 @@
 ######################
 ##schema name goes here
 ######################
-schema_name = input("Input schema name to generate schema readme for:")
+schema_name = input("Input schema name to generate schema readme for:")
 #schema_name = 'rescu'
 row_count_on = input("Row count on? (True/False) Can be slow for certain schemas.")
 #row_count_on = True #change to false to omit row counts (can be very slow on certain schemas)
 
 #find table names from information_schema.tables
-table_sql = '''
+table_sql = sql.SQL('''
 SELECT table_name
 FROM information_schema.tables
-WHERE table_schema = '{}'
+WHERE table_schema = {schema}
     AND table_type <> 'VIEW';
-'''
+''')
 
 #find column names and types from information_schema.columns
-columns_sql = '''
+columns_sql = sql.SQL('''
 SELECT column_name, data_type
 FROM information_schema.columns
-WHERE table_schema = '{}'
-    AND table_name = '{}';
-'''
+WHERE table_schema = {schema}
+    AND table_name = {table};
+''')
 
 #first row of table as sample
-sample_sql = '''
+sample_sql = sql.SQL('''
 SELECT *
-FROM {}.{}
+FROM {schema}.{table}
 LIMIT 1;
-'''
+''')
 
 #first row of table as sample
-rowcount_sql = '''
+rowcount_sql = sql.SQL('''
 SELECT COUNT(1)
-FROM {}.{};
-'''
+FROM {schema}.{table};
+''')
 
-column_comments_sql = '''
+column_comments_sql = sql.SQL('''
     SELECT
         c.column_name,
         pgd.description
@@ -61,9 +61,9 @@
         AND c.table_schema = st.schemaname
         AND c.table_name = st.relname
     )
-    WHERE c.table_schema = '{}'
-        AND c.table_name = '{}';
-'''
+    WHERE c.table_schema = {schema}
+        AND c.table_name = {table};
+''')
 
 #create directory if not exists
 #home folder
@@ -80,30 +80,40 @@
 print("Destination path: " + fname)
 
 with con:
-    #identify tables within schema
-    tables = pandas.read_sql(table_sql.format(schema_name), con)
+
+    #identify tables within schema
+    tables = pd.read_sql_query(table_sql.format(
+        schema = sql.Literal(schema_name)), con)
+
     if tables.empty:
-        print("No tables found in schema '{}'".format(schema_name))
+        print(f"No tables found in schema '{schema_name}'")
 
     #for each table
     for table_name in tables['table_name']:
 
         #query columns & datatypes from information_schema
-        column_types = pandas.read_sql(columns_sql.format(schema_name, table_name), con)
+        column_types = pd.read_sql(columns_sql.format(
+            schema = sql.Literal(schema_name),
+            table = sql.Literal(table_name)), con)
 
         #query sample row from schema.table and transpose
-        data_sample = pandas.read_sql(sample_sql.format(schema_name, table_name), con)
+        data_sample = pd.read_sql(sample_sql.format(
+            schema = sql.Identifier(schema_name),
+            table = sql.Identifier(table_name)), con)
         data_sample_T = data_sample.T
        data_sample_T["column_name"] = data_sample_T.index
         data_sample_T.rename(columns= {0: "sample"}, inplace=True)
 
         #row count
         if row_count_on:
-            row_count = pandas.read_sql(rowcount_sql.format(schema_name, table_name), con)
+            row_count = pd.read_sql(rowcount_sql.format(
+                schema = sql.Identifier(schema_name),
+                table = sql.Identifier(table_name)), con)
 
         #column comments --tested with miovision_api (has 3 column comments)
-        column_comments = pandas.read_sql(column_comments_sql.format(schema_name, table_name), con)
+        column_comments = pd.read_sql(column_comments_sql.format(
+            schema = sql.Literal(schema_name),
+            table = sql.Literal(table_name)), con)
 
         #merge sample with column types
         final = column_types.merge(data_sample_T, how = 'left', on = 'column_name')
@@ -121,4 +131,6 @@
         file.write("{}.{}\n".format(schema_name, table_name))
         if(row_count_on):
             file.write("Row count: {:,}\n".format(row_count['count'][0]))
-        file.write(final_formatted + "\n\n")
\ No newline at end of file
+        file.write(final_formatted + "\n\n")
+
+print(f"File path of output: {fname}")
\ No newline at end of file
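
The psycopg2.sql module adopted above composes queries from typed fragments instead of plain string interpolation: sql.Identifier renders a double-quoted name (for FROM clauses), while sql.Literal renders a quoted value (for comparisons against information_schema columns). A minimal sketch of the pattern, with hypothetical schema and table names:

    from psycopg2 import sql

    query = sql.SQL("SELECT * FROM {schema}.{table} LIMIT 1;").format(
        schema=sql.Identifier("rescu"),      # hypothetical names
        table=sql.Identifier("raw_20sec"),
    )
    # as_string() needs a connection (or cursor) to apply the server's quoting:
    # query.as_string(con) -> 'SELECT * FROM "rescu"."raw_20sec" LIMIT 1;'
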
From e992384d407875f7190569eb815d67fdbb1f2af5 Mon Sep 17 00:00:00 2001
From: gabrielwol <80077912+gabrielwol@users.noreply.github.com>
Date: Mon, 13 May 2024 17:24:17 -0400
Subject: [PATCH 06/10] updates to generate_sql_readme

---
 bigdata_schema_readmes/generate_sql_readme.py | 173 ++++++++++--------
 1 file changed, 95 insertions(+), 78 deletions(-)

diff --git a/bigdata_schema_readmes/generate_sql_readme.py b/bigdata_schema_readmes/generate_sql_readme.py
index 4cd56d4..44d90a7 100644
--- a/bigdata_schema_readmes/generate_sql_readme.py
+++ b/bigdata_schema_readmes/generate_sql_readme.py
@@ -1,72 +1,102 @@
 import pandas as pd
 import os.path
 import configparser
-from psycopg2 import connect
-from psycopg2 import sql
+import sqlalchemy
 
 home_dir = os.path.expanduser('~')
 
 CONFIG = configparser.ConfigParser()
 CONFIG.read(os.path.join(home_dir, 'db.cfg')) #Creates a path to your db.cfg file
-dbset = CONFIG['DBSETTINGS']
-con = connect(**dbset)
+
+"""
+sqlalchemy cfg format:
+[SQLALCHEMY]
+host=
+database=
+username=
+password=
+"""
+dbset = CONFIG['SQLALCHEMY']
+url_object = sqlalchemy.engine.URL.create(
+    "postgresql+psycopg2",
+    **dbset
+)
+engine = sqlalchemy.create_engine(url_object)
 
 ######################
 ##schema name goes here
 ######################
 schema_name = input("Input schema name to generate schema readme for:")
-#schema_name = 'rescu'
+#schema_name = 'ecocounter'
 row_count_on = input("Row count on? (True/False) Can be slow for certain schemas.")
 #row_count_on = True #change to false to omit row counts (can be very slow on certain schemas)
 
 #find table names from information_schema.tables
-table_sql = sql.SQL('''
+table_sql = '''
 SELECT table_name
 FROM information_schema.tables
-WHERE table_schema = {schema}
+WHERE table_schema = '{}'
     AND table_type <> 'VIEW';
-''')
+'''
 
 #find column names and types from information_schema.columns
-columns_sql = sql.SQL('''
+columns_sql = '''
 SELECT column_name, data_type
 FROM information_schema.columns
-WHERE table_schema = {schema}
-    AND table_name = {table};
-''')
+WHERE table_schema = '{}'
+    AND table_name = '{}';
+'''
+
+column_comments_sql = '''
+SELECT
+    a.attname AS column_name,
+    d.description AS "Comments"
+FROM pg_class AS c
+JOIN pg_attribute AS a ON c.oid = a.attrelid
+JOIN pg_namespace AS n ON n.oid = c.relnamespace
+JOIN pg_description AS d ON
+    d.objoid = c.oid
+    AND d.objsubid = a.attnum
+WHERE
+    n.nspname = '{}'
+    AND c.relname = '{}'
+    AND d.description IS NOT NULL;
+'''
+
+table_comments_sql = '''
+SELECT pgd.description
+FROM pg_description AS pgd
+JOIN pg_class AS pgc ON pgd.objoid = pgc.oid
+JOIN pg_namespace pgn ON pgc.relnamespace = pgn.oid
+WHERE
+    pgn.nspname = '{}'
+    AND pgc.relname = '{}'
+'''
 
 #first row of table as sample
-sample_sql = sql.SQL('''
+sample_sql = '''
 SELECT *
-FROM {schema}.{table}
+FROM {}.{}
 LIMIT 1;
-''')
+'''
 
-#first row of table as sample
-rowcount_sql = sql.SQL('''
+#rowcount
+rowcount_sql = '''
 SELECT COUNT(1)
-FROM {schema}.{table};
-''')
-
-column_comments_sql = sql.SQL('''
-    SELECT
-        c.column_name,
-        pgd.description
-    FROM pg_catalog.pg_statio_all_tables AS st
-    INNER JOIN pg_catalog.pg_description AS pgd ON (
-        pgd.objoid = st.relid
-    )
-    INNER JOIN information_schema.columns AS c ON (
-        pgd.objsubid = c.ordinal_position
-        AND c.table_schema = st.schemaname
-        AND c.table_name = st.relname
-    )
-    WHERE c.table_schema = {schema}
-        AND c.table_name = {table};
-''')
+FROM {}.{};
+'''
+
+#Don't fail if some columns are not in the dataset.
+#Source: https://stackoverflow.com/a/62658311
+def custom_dataset(dataset, req_cols):
+    in_ = []
+    if isinstance(dataset, pd.DataFrame): # optional
+        for col in req_cols: # check for every existing column
+            if col in dataset.columns:
+                in_.append(col) # append those that are in (i.e. valid)
+    return dataset[in_] if in_ else None
 
 #create directory if not exists
-#home folder
 dir = home_dir + "/bigdata_schema_readmes"
 if os.path.exists(dir) is False:
     os.mkdir(dir)
@@ -77,60 +107,47 @@
 if os.path.isfile(fname):
     os.remove(fname)
 
-print("Destination path: " + fname)
-
-with con:
-
-    #identify tables within schema
-    tables = pd.read_sql_query(table_sql.format(
-        schema = sql.Literal(schema_name)), con)
-
+with engine.connect() as con:
+    #identify tables within schema
+    tables = pd.read_sql(table_sql, con, params=(schema_name,))
     if tables.empty:
-        print(f"No tables found in schema '{schema_name}'")
-
+        print("No tables found in schema '{}'".format(schema_name))
     #for each table
-    for table_name in tables['table_name']:
-
+    for table_name in tables['table_name']:
+        print(table_name)
         #query columns & datatypes from information_schema
-        column_types = pd.read_sql(columns_sql.format(
-            schema = sql.Literal(schema_name),
-            table = sql.Literal(table_name)), con)
-
+        column_types = pd.read_sql(columns_sql, con, params=(schema_name, table_name))
+        column_comments = pd.read_sql(column_comments_sql, con, params=(schema_name, table_name))
         #query sample row from schema.table and transpose
-        data_sample = pd.read_sql(sample_sql.format(
-            schema = sql.Identifier(schema_name),
-            table = sql.Identifier(table_name)), con)
+        data_sample = pd.read_sql(sample_sql, con, params=(schema_name, table_name))
         data_sample_T = data_sample.T
         data_sample_T["column_name"] = data_sample_T.index
-        data_sample_T.rename(columns= {0: "sample"}, inplace=True)
-
+        data_sample_T.rename(columns= {0: "sample"}, inplace=True)
+        table_comments = pd.read_sql(table_comments_sql, con, params=(schema_name, table_name))
+        try:
+            table_comment = table_comments['description'][0]
+        except KeyError:
+            table_comment = ''
         #row count
         if row_count_on:
-            row_count = pd.read_sql(rowcount_sql.format(
-                schema = sql.Identifier(schema_name),
-                table = sql.Identifier(table_name)), con)
-
-        #column comments --tested with miovision_api (has 3 column comments)
-        column_comments = pd.read_sql(column_comments_sql.format(
-            schema = sql.Literal(schema_name),
-            table = sql.Literal(table_name)), con)
-
-        #merge sample with column types
-        final = column_types.merge(data_sample_T, how = 'left', on = 'column_name')
-        final = final.merge(column_comments, how = 'left', on = 'column_name')
-        final['description'] = final['description'].fillna('')
-
+            row_count = pd.read_sql(rowcount_sql, con, params=(schema_name, table_name))
+        #merge sample with column types, comments
+        final = column_types.merge(data_sample_T, on = 'column_name')
+        final = column_comments.merge(final, on = 'column_name', how='right')
+        #reorder columns
+        final=custom_dataset(final, ['Column Name', 'Data Type', 'Sample', 'Comments'])
+        #replace nans
+        final.fillna('', inplace=True)
         #markdown format for github
-        final_formatted = final.to_markdown(index = False)
-
+        final_formatted = final.to_markdown(index = False, tablefmt="github")
         #print for debugging
-        print(final_formatted)
-
+        #print(final_formatted)
         #write formatted output with table name as header
         with open(fname, "a") as file: #append
-            file.write("{}.{}\n".format(schema_name, table_name))
+            file.write("### `{}.{}`\n".format(schema_name, table_name))
+            file.write(f"{table_comment}\n\n")
             if(row_count_on):
                 file.write("Row count: {:,}\n".format(row_count['count'][0]))
             file.write(final_formatted + "\n\n")
 
-print(f"File path of output: {fname}")
\ No newline at end of file
+print(f"File path of output: {fname}")
From 1a3fa6c0b40b2985e52eb50e0a0b869316a54843 Mon Sep 17 00:00:00 2001
From: gabrielwol <80077912+gabrielwol@users.noreply.github.com>
Date: Mon, 7 Apr 2025 18:59:40 +0000
Subject: [PATCH 07/10] #21 fix up sqlalchemy working version; approx row
 counts, add object type, prefix filtering

---
 bigdata_schema_readmes/generate_sql_readme.py | 144 ++++++++++--------
 1 file changed, 82 insertions(+), 62 deletions(-)

diff --git a/bigdata_schema_readmes/generate_sql_readme.py b/bigdata_schema_readmes/generate_sql_readme.py
index 44d90a7..fd7b754 100644
--- a/bigdata_schema_readmes/generate_sql_readme.py
+++ b/bigdata_schema_readmes/generate_sql_readme.py
@@ -2,6 +2,7 @@
 import os.path
 import configparser
 import sqlalchemy
+from sqlalchemy import sql
 
 home_dir = os.path.expanduser('~')
 
@@ -26,65 +27,79 @@
 ######################
 ##schema name goes here
 ######################
-schema_name = input("Input schema name to generate schema readme for:")
-#schema_name = 'ecocounter'
-row_count_on = input("Row count on? (True/False) Can be slow for certain schemas.")
-#row_count_on = True #change to false to omit row counts (can be very slow on certain schemas)
+input_schema = input("Input schema name to generate schema readme for:")
+
+# Parse schema and prefix from input
+try:
+    schema_name, table_prefix = input_schema.split('.')
+except ValueError:
+    schema_name = input_schema
 
 #find table names from information_schema.tables
-table_sql = '''
-SELECT table_name
-FROM information_schema.tables
-WHERE table_schema = '{}'
-    AND table_type <> 'VIEW';
-'''
+if table_prefix:
+    table_sql = sql.text('''
+        SELECT
+            c.relname AS table_name,
+            CASE c.relkind
+                WHEN 'p' THEN 'partitioned table'
+                WHEN 'r' THEN 'table'
+            END AS table_type
+        FROM pg_catalog.pg_class AS c
+        JOIN pg_catalog.pg_namespace AS n ON n.oid = c.relnamespace
+        WHERE
+            n.nspname = :schema
+            AND c.relname LIKE :prefix
+            -- add more type: https://www.postgresql.org/docs/current/catalog-pg-class.html
+            AND c.relkind = ANY('{p,r}')
+            AND NOT c.relispartition --exclude child partitions
+        ORDER BY 1, 2;
+    ''')
+else:
+    table_sql = sql.text('''
+        SELECT
+            c.relname AS table_name,
+            CASE c.relkind
+                WHEN 'p' THEN 'partitioned table'
+                WHEN 'r' THEN 'table'
+            END AS table_type
+        FROM pg_catalog.pg_class AS c
+        JOIN pg_catalog.pg_namespace AS n ON n.oid = c.relnamespace
+        WHERE
+            n.nspname = :schema
+            -- add more type: https://www.postgresql.org/docs/current/catalog-pg-class.html
+            AND c.relkind = ANY('{p,r}')
+            AND NOT c.relispartition --exclude child partitions
+        ORDER BY 1, 2;
+    ''')
 
 #find column names and types from information_schema.columns
-columns_sql = '''
-SELECT column_name, data_type
-FROM information_schema.columns
-WHERE table_schema = '{}'
-    AND table_name = '{}';
-'''
-
-column_comments_sql = '''
+columns_sql = sql.text('''
 SELECT
-    a.attname AS column_name,
-    d.description AS "Comments"
+    a.attname AS "Column Name",
+    d.description AS "Comments",
+    pg_catalog.format_type(a.atttypid, a.atttypmod) as "Data type"
 FROM pg_class AS c
 JOIN pg_attribute AS a ON c.oid = a.attrelid
 JOIN pg_namespace AS n ON n.oid = c.relnamespace
-JOIN pg_description AS d ON
+LEFT JOIN pg_description AS d ON
     d.objoid = c.oid
     AND d.objsubid = a.attnum
 WHERE
-    n.nspname = '{}'
-    AND c.relname = '{}'
-    AND d.description IS NOT NULL;
-'''
+    n.nspname = :schema
+    AND c.relname = :table
+    AND attisdropped = false
+    AND attnum >= 1;
+''')
 
-table_comments_sql = '''
+table_comments_sql = sql.text('''
 SELECT pgd.description
 FROM pg_description AS pgd
 JOIN pg_class AS pgc ON pgd.objoid = pgc.oid
 JOIN pg_namespace pgn ON pgc.relnamespace = pgn.oid
 WHERE
-    pgn.nspname = '{}'
-    AND pgc.relname = '{}'
-'''
-
-#first row of table as sample
-sample_sql = '''
-SELECT *
-FROM {}.{}
-LIMIT 1;
-'''
-
-#rowcount
-rowcount_sql = '''
-SELECT COUNT(1)
-FROM {}.{};
-'''
+    pgn.nspname = :schema
+    AND pgc.relname = :table
+''')
 
 #Don't fail if some columns are not in the dataset.
 #Source: https://stackoverflow.com/a/62658311
@@ -109,31 +124,35 @@ def custom_dataset(dataset, req_cols):
 
 with engine.connect() as con:
     #identify tables within schema
-    tables = pd.read_sql(table_sql, con, params=(schema_name,))
+    if table_prefix:
+        tables = pd.read_sql_query(table_sql, con, params={'schema': schema_name, 'prefix': f"{table_prefix}%"})
+    else:
+        tables = pd.read_sql_query(table_sql, con, params={'schema': schema_name})
     if tables.empty:
         print("No tables found in schema '{}'".format(schema_name))
     #for each table
     for table_name in tables['table_name']:
-        print(table_name)
+        print(f"Processing {table_name}...")
         #query columns & datatypes from information_schema
-        column_types = pd.read_sql(columns_sql, con, params=(schema_name, table_name))
-        column_comments = pd.read_sql(column_comments_sql, con, params=(schema_name, table_name))
-        #query sample row from schema.table and transpose
-        data_sample = pd.read_sql(sample_sql, con, params=(schema_name, table_name))
+        column_types = pd.read_sql_query(columns_sql, con, params={'schema': schema_name, 'table': table_name})
+        #query sample row from schema.table and transpose
+        sample_query = sql.text(f"SELECT * FROM {schema_name}.{table_name} LIMIT 1")
+        data_sample = pd.read_sql_query(sample_query, con)
         data_sample_T = data_sample.T
-        data_sample_T["column_name"] = data_sample_T.index
-        data_sample_T.rename(columns= {0: "sample"}, inplace=True)
-        table_comments = pd.read_sql(table_comments_sql, con, params=(schema_name, table_name))
+        data_sample_T["Column Name"] = data_sample_T.index
+        data_sample_T.rename(columns= {0: "Sample"}, inplace=True)
+        table_comments = pd.read_sql_query(table_comments_sql, con, params={'schema': schema_name, 'table': table_name})
         try:
             table_comment = table_comments['description'][0]
         except KeyError:
             table_comment = ''
-        #row count
-        if row_count_on:
-            row_count = pd.read_sql(rowcount_sql, con, params=(schema_name, table_name))
+        #approx row count
+        rowcount_sql = sql.text(f'''
+            SELECT TO_CHAR(COUNT(1) * 100, '999,999,999,999,999') AS c FROM {schema_name}.{table_name} TABLESAMPLE SYSTEM (1);
+        ''')
+        row_count = pd.read_sql_query(rowcount_sql, con)
         #merge sample with column types, comments
-        final = column_types.merge(data_sample_T, on = 'column_name')
-        final = column_comments.merge(final, on = 'column_name', how='right')
+        final = column_types.merge(data_sample_T, on = 'Column Name')
         #reorder columns
         final=custom_dataset(final, ['Column Name', 'Data Type', 'Sample', 'Comments'])
         #replace nans
         final.fillna('', inplace=True)
         #markdown format for github
         final_formatted = final.to_markdown(index = False, tablefmt="github")
         #print for debugging
         #print(final_formatted)
-        #write formatted output with table name as header
+        object_type = tables.loc[tables.table_name == table_name, 'table_type'].iloc[0]
+
+        #write formatted output with table name as header
         with open(fname, "a") as file: #append
-            file.write("### `{}.{}`\n".format(schema_name, table_name))
-            file.write(f"{table_comment}\n\n")
-            if(row_count_on):
-                file.write("Row count: {:,}\n".format(row_count['count'][0]))
+            file.write(f"### `{schema_name}.{table_name}` ({object_type})\n")
+            file.write(f"{table_comment}\n")
+            file.write(f"Approx row count: {row_count['c'][0]}\n")
             file.write(final_formatted + "\n\n")
 
-print(f"File path of output: {fname}")
+print(f"File path of output: {fname}")
\ No newline at end of file

From 722f21dd5310a960f058b0a831708908a1b8bb54 Mon Sep 17 00:00:00 2001
From: gabrielwol <80077912+gabrielwol@users.noreply.github.com>
Date: Mon, 7 Apr 2025 19:06:38 +0000
Subject: [PATCH 08/10] #21 fix column header mismatch

---
 bigdata_schema_readmes/generate_sql_readme.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/bigdata_schema_readmes/generate_sql_readme.py b/bigdata_schema_readmes/generate_sql_readme.py
index fd7b754..4e499b1 100644
--- a/bigdata_schema_readmes/generate_sql_readme.py
+++ b/bigdata_schema_readmes/generate_sql_readme.py
@@ -77,7 +77,7 @@
 SELECT
     a.attname AS "Column Name",
     d.description AS "Comments",
-    pg_catalog.format_type(a.atttypid, a.atttypmod) as "Data type"
+    pg_catalog.format_type(a.atttypid, a.atttypmod) as "Data Type"
 FROM pg_class AS c
 JOIN pg_attribute AS a ON c.oid = a.attrelid
 JOIN pg_namespace AS n ON n.oid = c.relnamespace
@@ -159,11 +159,9 @@ def custom_dataset(dataset, req_cols):
         final.fillna('', inplace=True)
         #markdown format for github
         final_formatted = final.to_markdown(index = False, tablefmt="github")
-        #print for debugging
-        #print(final_formatted)
-        #write formatted output with table name as header
         object_type = tables.loc[tables.table_name == table_name, 'table_type'].iloc[0]
+        #write formatted output with table name as header
         with open(fname, "a") as file: #append
             file.write(f"### `{schema_name}.{table_name}` ({object_type})\n")
             file.write(f"{table_comment}\n")
From 9676fffb44845a3a2310c47d41120328d1bae330 Mon Sep 17 00:00:00 2001
From: gabrielwol <80077912+gabrielwol@users.noreply.github.com>
Date: Tue, 8 Apr 2025 13:58:10 +0000
Subject: [PATCH 09/10] #21 add cli

---
 bigdata_schema_readmes/generate_sql_readme.py | 183 +++++++++---------
 1 file changed, 95 insertions(+), 88 deletions(-)

diff --git a/bigdata_schema_readmes/generate_sql_readme.py b/bigdata_schema_readmes/generate_sql_readme.py
index 4e499b1..e3f366f 100644
--- a/bigdata_schema_readmes/generate_sql_readme.py
+++ b/bigdata_schema_readmes/generate_sql_readme.py
@@ -3,6 +3,7 @@
 import configparser
 import sqlalchemy
 from sqlalchemy import sql
+import click
 
 home_dir = os.path.expanduser('~')
 
@@ -24,38 +25,27 @@
 )
 engine = sqlalchemy.create_engine(url_object)
 
-######################
-##schema name goes here
-######################
-input_schema = input("Input schema name to generate schema readme for:")
-
-# Parse schema and prefix from input
-try:
-    schema_name, table_prefix = input_schema.split('.')
-except ValueError:
-    schema_name = input_schema
-
-#find table names from information_schema.tables
-if table_prefix:
-    table_sql = sql.text('''
-        SELECT
-            c.relname AS table_name,
-            CASE c.relkind
-                WHEN 'p' THEN 'partitioned table'
-                WHEN 'r' THEN 'table'
-            END AS table_type
-        FROM pg_catalog.pg_class AS c
-        JOIN pg_catalog.pg_namespace AS n ON n.oid = c.relnamespace
-        WHERE
-            n.nspname = :schema
-            AND c.relname LIKE :prefix
-            -- add more type: https://www.postgresql.org/docs/current/catalog-pg-class.html
-            AND c.relkind = ANY('{p,r}')
-            AND NOT c.relispartition --exclude child partitions
-        ORDER BY 1, 2;
-    ''')
-else:
-    table_sql = sql.text('''
+#find table names to iterate over
+def table_sql(table_prefix = None):
+    if table_prefix is not None:
+        return sql.text('''
+            SELECT
+                c.relname AS table_name,
+                CASE c.relkind
+                    WHEN 'p' THEN 'partitioned table'
+                    WHEN 'r' THEN 'table'
+                END AS table_type
+            FROM pg_catalog.pg_class AS c
+            JOIN pg_catalog.pg_namespace AS n ON n.oid = c.relnamespace
+            WHERE
+                n.nspname = :schema
+                AND c.relname LIKE :prefix
+                -- add more type: https://www.postgresql.org/docs/current/catalog-pg-class.html
+                AND c.relkind = ANY('{p,r}')
+                AND NOT c.relispartition --exclude child partitions
+            ORDER BY 1, 2;
+        ''')
+    return sql.text('''
         SELECT
             c.relname AS table_name,
             CASE c.relkind
@@ -66,7 +56,7 @@
         JOIN pg_catalog.pg_namespace AS n ON n.oid = c.relnamespace
         WHERE
             n.nspname = :schema
-            -- add more type: https://www.postgresql.org/docs/current/catalog-pg-class.html
+            -- tables and partitioned tables
            AND c.relkind = ANY('{p,r}')
             AND NOT c.relispartition --exclude child partitions
         ORDER BY 1, 2;
@@ -101,61 +101,78 @@ def custom_dataset(dataset, req_cols):
             in_.append(col) # append those that are in (i.e. valid)
     return dataset[in_] if in_ else None
 
-#create directory if not exists
-dir = home_dir + "/bigdata_schema_readmes"
-if os.path.exists(dir) is False:
-    os.mkdir(dir)
-    print("Creating directory: {}".format(dir))
+def get_schema_readmes(schema_name, table_prefix):
+    #create directory if not exists
+    dir = home_dir + "/bigdata_schema_readmes"
+    if os.path.exists(dir) is False:
+        os.mkdir(dir)
+        print("Creating directory: {}".format(dir))
 
-#remove file if exists
-fname = dir + "/{}_readme.txt".format(schema_name)
-if os.path.isfile(fname):
-    os.remove(fname)
+    #remove file if exists
+    fname = dir + "/{}_readme.txt".format(schema_name)
+    if os.path.isfile(fname):
+        os.remove(fname)
 
-with engine.connect() as con:
-    #identify tables within schema
-    if table_prefix:
-        tables = pd.read_sql_query(table_sql, con, params={'schema': schema_name, 'prefix': f"{table_prefix}%"})
-    else:
-        tables = pd.read_sql_query(table_sql, con, params={'schema': schema_name})
-    if tables.empty:
-        print("No tables found in schema '{}'".format(schema_name))
-    #for each table
-    for table_name in tables['table_name']:
-        print(f"Processing {table_name}...")
-        #query columns & datatypes from information_schema
-        column_types = pd.read_sql_query(columns_sql, con, params={'schema': schema_name, 'table': table_name})
-        #query sample row from schema.table and transpose
-        sample_query = sql.text(f"SELECT * FROM {schema_name}.{table_name} LIMIT 1")
-        data_sample = pd.read_sql_query(sample_query, con)
-        data_sample_T = data_sample.T
-        data_sample_T["Column Name"] = data_sample_T.index
-        data_sample_T.rename(columns= {0: "Sample"}, inplace=True)
-        table_comments = pd.read_sql_query(table_comments_sql, con, params={'schema': schema_name, 'table': table_name})
-        try:
-            table_comment = table_comments['description'][0]
-        except KeyError:
-            table_comment = ''
-        #approx row count
-        rowcount_sql = sql.text(f'''
-            SELECT TO_CHAR(COUNT(1) * 100, '999,999,999,999,999') AS c FROM {schema_name}.{table_name} TABLESAMPLE SYSTEM (1);
-        ''')
-        row_count = pd.read_sql_query(rowcount_sql, con)
-        #merge sample with column types, comments
-        final = column_types.merge(data_sample_T, on = 'Column Name')
-        #reorder columns
-        final=custom_dataset(final, ['Column Name', 'Data Type', 'Sample', 'Comments'])
-        #replace nans
-        final.fillna('', inplace=True)
-        #markdown format for github
-        final_formatted = final.to_markdown(index = False, tablefmt="github")
-        object_type = tables.loc[tables.table_name == table_name, 'table_type'].iloc[0]
-        #write formatted output with table name as header
-        with open(fname, "a") as file: #append
-            file.write(f"### `{schema_name}.{table_name}` ({object_type})\n")
-            file.write(f"{table_comment}\n")
-            file.write(f"Approx row count: {row_count['c'][0]}\n")
-            file.write(final_formatted + "\n\n")
+    with engine.connect() as con:
+        #identify tables within schema
+        if table_prefix is not None:
+            tables = pd.read_sql_query(table_sql(table_prefix), con, params={'schema': schema_name, 'prefix': f"{table_prefix}%"})
+        else:
+            tables = pd.read_sql_query(table_sql(), con, params={'schema': schema_name})
+        if tables.empty:
+            print("No tables found in schema '{}'".format(schema_name))
+        #for each table
+        for table_name in tables['table_name']:
+            print(f"Processing {table_name}...")
+            #query columns & datatypes from information_schema
+            column_types = pd.read_sql_query(columns_sql, con, params={'schema': schema_name, 'table': table_name})
+            #query sample row from schema.table and transpose
+            sample_query = sql.text(f"SELECT * FROM {schema_name}.{table_name} LIMIT 1")
+            data_sample = pd.read_sql_query(sample_query, con)
+            data_sample_T = data_sample.T
+            data_sample_T["Column Name"] = data_sample_T.index
+            data_sample_T.rename(columns= {0: "Sample"}, inplace=True)
+            table_comments = pd.read_sql_query(table_comments_sql, con, params={'schema': schema_name, 'table': table_name})
+            try:
+                table_comment = table_comments['description'][0]
+            except KeyError:
+                table_comment = ''
+            #approx row count
+            rowcount_sql = sql.text(f'''
+                SELECT TO_CHAR(COUNT(1) * 100, '999,999,999,999,999') AS c FROM {schema_name}.{table_name} TABLESAMPLE SYSTEM (1);
+            ''')
+            row_count = pd.read_sql_query(rowcount_sql, con)
+            #merge sample with column types, comments
+            final = column_types.merge(data_sample_T, on = 'Column Name')
+            #reorder columns
+            final=custom_dataset(final, ['Column Name', 'Data Type', 'Sample', 'Comments'])
+            #replace nans
+            final.fillna('', inplace=True)
+            #markdown format for github
+            final_formatted = final.to_markdown(index = False, tablefmt="github")
+            object_type = tables.loc[tables.table_name == table_name, 'table_type'].iloc[0]
+            #write formatted output with table name as header
+            with open(fname, "a") as file: #append
+                file.write(f"### `{schema_name}.{table_name}` ({object_type})\n")
+                file.write(f"{table_comment}\n")
+                file.write(f"Approx row count: {row_count['c'][0]}\n")
+                file.write(final_formatted + "\n\n")
+
+    print(f"File path of output: {fname}")
+
+@click.command()
+@click.option('--schema-name', '-s', type = str, required = True, help = 'Name of destination schema')
+@click.option('--table-prefix', '-t', type = str, default=None, required = False, help = 'Optional table prefix')
+def get_schema_readmes_cli(schema_name, table_prefix):
+    """
+    This script generates readmes for a schema in bigdata. Optionally filter using `table-prefix` param.
+
+    Example:
+
+    python3 generate_sql_readme.py --schema-name bluetooth --table-prefix itsc
+    """
+    get_schema_readmes(schema_name=schema_name, table_prefix=table_prefix)
 
-print(f"File path of output: {fname}")
\ No newline at end of file
+if __name__ == '__main__':
+    get_schema_readmes_cli()
\ No newline at end of file

From 75979886d43178245b648e4e78adcf5980581479 Mon Sep 17 00:00:00 2001
From: gabrielwol <80077912+gabrielwol@users.noreply.github.com>
Date: Tue, 8 Apr 2025 14:01:57 +0000
Subject: [PATCH 10/10] #21 truncate long columns

---
 bigdata_schema_readmes/generate_sql_readme.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/bigdata_schema_readmes/generate_sql_readme.py b/bigdata_schema_readmes/generate_sql_readme.py
index e3f366f..66454cc 100644
--- a/bigdata_schema_readmes/generate_sql_readme.py
+++ b/bigdata_schema_readmes/generate_sql_readme.py
@@ -132,6 +132,7 @@ def get_schema_readmes(schema_name, table_prefix):
             data_sample_T = data_sample.T
             data_sample_T["Column Name"] = data_sample_T.index
             data_sample_T.rename(columns= {0: "Sample"}, inplace=True)
+            data_sample_T['Sample'] = data_sample_T['Sample'].apply(lambda x: str(x)[:80])
             table_comments = pd.read_sql_query(table_comments_sql, con, params={'schema': schema_name, 'table': table_name})
             try:
                 table_comment = table_comments['description'][0]
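
With the full series applied, a run looks roughly like this (the short flags come from the click options; the schema, table, and path below are illustrative only):

    $ python3 generate_sql_readme.py -s bluetooth -t itsc
    Processing itsc_central_vds_config...
    File path of output: /home/jdoe/bigdata_schema_readmes/bluetooth_readme.txt
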