From 9fc4cf4278db6e47046e817f976743477dc20aa1 Mon Sep 17 00:00:00 2001
From: Gabe Wolofsky <80077912+gabrielwol@users.noreply.github.com>
Date: Thu, 8 Jun 2023 14:58:44 -0400
Subject: [PATCH 01/10] initial commit for generate_sql_readme.py

---
 bigdata_schema_readmes/generate_sql_readme.py | 95 +++++++++++++++++++
 1 file changed, 95 insertions(+)
 create mode 100644 bigdata_schema_readmes/generate_sql_readme.py

diff --git a/bigdata_schema_readmes/generate_sql_readme.py b/bigdata_schema_readmes/generate_sql_readme.py
new file mode 100644
index 0000000..4c13321
--- /dev/null
+++ b/bigdata_schema_readmes/generate_sql_readme.py
@@ -0,0 +1,95 @@
+import pandas
+import os.path
+from pathlib import Path
+import configparser
+from psycopg2 import connect
+
+CONFIG = configparser.ConfigParser()
+CONFIG.read(str(Path.home().joinpath('db.cfg'))) #Creates a path to your db.cfg file
+dbset = CONFIG['DBSETTINGS']
+con = connect(**dbset)
+
+######################
+##schema name goes here
+######################
+schema_name = 'rescu'
+
+#find table names from information_schema.tables
+table_sql = '''
+SELECT table_name
+FROM information_schema.tables
+WHERE table_schema = '{}'
+    AND table_type <> 'VIEW';
+'''
+
+#find column names and types from information_schema.columns
+columns_sql = '''
+SELECT column_name, data_type
+FROM information_schema.columns
+WHERE table_schema = '{}'
+    AND table_name = '{}';
+'''
+
+#first row of table as sample
+sample_sql = '''
+SELECT *
+FROM {}.{}
+LIMIT 1;
+'''
+
+#first row of table as sample
+rowcount_sql = '''
+SELECT COUNT(1)
+FROM {}.{};
+'''
+
+#create directory if not exists
+#home folder
+dir = "bigdata_schema_readmes"
+if os.path.exists(dir) is False:
+    os.mkdir(dir)
+    print("Creating directory: {}".format(dir))
+
+#remove file if exists
+fname = dir + "/{}_readme.txt".format(schema_name)
+if os.path.isfile(fname):
+    os.remove(fname)
+
+with con:
+    #identify tables within schema
+    tables = pandas.read_sql(table_sql.format(schema_name), con)
+
+    if tables.empty:
+        print("No tables found in schema '{}'".format(schema_name))
+
+    #for each table
+    for table_name in tables['table_name']:
+
+        #query columns & datatypes from information_schema
+        column_types = pandas.read_sql(columns_sql.format(schema_name, table_name), con)
+
+        #query sample row from schema.table and transpose
+        data_sample = pandas.read_sql(sample_sql.format(schema_name, table_name), con)
+        data_sample_T = data_sample.T
+        data_sample_T["column_name"] = data_sample_T.index
+        data_sample_T.rename(columns= {0: "sample"}, inplace=True)
+
+        #row count
+        row_count = pandas.read_sql(rowcount_sql.format(schema_name, table_name), con)
+
+        #merge sample with column types
+        final = column_types.merge(data_sample_T, on = 'column_name')
+        final['Comments'] = '' #blank column for comments
+
+        #markdown format for github
+        final_formatted = final.to_markdown(index = False)
+
+        #print for debugging
+        print(final_formatted)
+
+        #write formatted output with table name as header
+        with open(fname, "a") as file: #append
+            file.write("{}.{}\n".format(schema_name, table_name) +
+                       "Row count: {:,}\n".format(row_count['count'][0]) +
+                       final_formatted +
+                       "\n\n")
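
The table layout this first version emits comes straight from pandas' DataFrame.to_markdown(), which needs the tabulate package installed. A minimal sketch of the call on made-up data (column names and values are illustrative only; actual spacing may differ slightly):

    import pandas as pd

    final = pd.DataFrame({
        'column_name': ['dt', 'volume'],          # hypothetical columns
        'data_type': ['date', 'integer'],
        'sample': ['2023-06-08', 42],
        'Comments': ['', ''],
    })
    print(final.to_markdown(index=False))
    # | column_name   | data_type   | sample     | Comments   |
    # |:--------------|:------------|:-----------|:-----------|
    # | dt            | date        | 2023-06-08 |            |
    # | volume        | integer     | 42         |            |
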
From f1abfde5735fd09883082f90b2e6b8836a982c1b Mon Sep 17 00:00:00 2001
From: Gabe Wolofsky <80077912+gabrielwol@users.noreply.github.com>
Date: Thu, 8 Jun 2023 15:14:55 -0400
Subject: [PATCH 02/10] Added column comments

---
 bigdata_schema_readmes/generate_sql_readme.py | 27 ++++++++++++++++---
 1 file changed, 24 insertions(+), 3 deletions(-)

diff --git a/bigdata_schema_readmes/generate_sql_readme.py b/bigdata_schema_readmes/generate_sql_readme.py
index 4c13321..6824094 100644
--- a/bigdata_schema_readmes/generate_sql_readme.py
+++ b/bigdata_schema_readmes/generate_sql_readme.py
@@ -12,7 +12,7 @@
 ######################
 ##schema name goes here
 ######################
-schema_name = 'rescu'
+schema_name = 'wys'
 
 #find table names from information_schema.tables
 table_sql = '''
@@ -43,6 +43,23 @@
 FROM {}.{};
 '''
 
+column_comments_sql = '''
+    SELECT
+        c.column_name,
+        pgd.description
+    FROM pg_catalog.pg_statio_all_tables AS st
+    INNER JOIN pg_catalog.pg_description AS pgd ON (
+        pgd.objoid = st.relid
+    )
+    INNER JOIN information_schema.columns AS c ON (
+        pgd.objsubid = c.ordinal_position
+        AND c.table_schema = st.schemaname
+        AND c.table_name = st.relname
+    )
+    WHERE c.table_schema = '{}'
+        AND c.table_name = '{}';
+'''
+
 #create directory if not exists
 #home folder
 dir = "bigdata_schema_readmes"
@@ -77,9 +94,13 @@
         #row count
         row_count = pandas.read_sql(rowcount_sql.format(schema_name, table_name), con)
 
+        #column comments
+        column_comments = pandas.read_sql(column_comments_sql.format(schema_name, table_name), con)
+
         #merge sample with column types
-        final = column_types.merge(data_sample_T, on = 'column_name')
-        final['Comments'] = '' #blank column for comments
+        final = column_types.merge(data_sample_T, how = 'left', on = 'column_name')
+        final = final.merge(column_comments, how = 'left', on = 'column_name')
+        final['description'] = final['description'].fillna('')
 
         #markdown format for github
         final_formatted = final.to_markdown(index = False)

From 5108263bf7e9375db10a0b2a32b4bd6b4ab16a32 Mon Sep 17 00:00:00 2001
From: Gabe Wolofsky <80077912+gabrielwol@users.noreply.github.com>
Date: Thu, 8 Jun 2023 15:58:53 -0400
Subject: [PATCH 03/10] added column comments

---
 bigdata_schema_readmes/generate_sql_readme.py | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/bigdata_schema_readmes/generate_sql_readme.py b/bigdata_schema_readmes/generate_sql_readme.py
index 6824094..da64906 100644
--- a/bigdata_schema_readmes/generate_sql_readme.py
+++ b/bigdata_schema_readmes/generate_sql_readme.py
@@ -12,7 +12,8 @@
 ######################
 ##schema name goes here
 ######################
-schema_name = 'wys'
+schema_name = 'miovision_api'
+row_count_on = True #change to false to omit row counts (can be very slow on certain schemas)
 
 #find table names from information_schema.tables
 table_sql = '''
@@ -91,10 +92,11 @@
         data_sample_T["column_name"] = data_sample_T.index
         data_sample_T.rename(columns= {0: "sample"}, inplace=True)
 
-        #row count
-        row_count = pandas.read_sql(rowcount_sql.format(schema_name, table_name), con)
+        #row count
+        if row_count_on:
+            row_count = pandas.read_sql(rowcount_sql.format(schema_name, table_name), con)
 
-        #column comments
+        #column comments --tested with miovision_api (has 3 column comments)
         column_comments = pandas.read_sql(column_comments_sql.format(schema_name, table_name), con)
 
         #merge sample with column types
@@ -110,7 +112,7 @@
 
         #write formatted output with table name as header
         with open(fname, "a") as file: #append
-            file.write("{}.{}\n".format(schema_name, table_name) +
-                       "Row count: {:,}\n".format(row_count['count'][0]) +
-                       final_formatted +
-                       "\n\n")
+            file.write("{}.{}\n".format(schema_name, table_name))
+            if(row_count_on):
+                file.write("Row count: {:,}\n".format(row_count['count'][0]))
+            file.write(final_formatted + "\n\n")
\ No newline at end of file
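
The descriptions surfaced by column_comments_sql live in pg_catalog.pg_description and are created with COMMENT statements; a hypothetical example of how one gets there in the first place (table and wording are illustrative only):

    COMMENT ON COLUMN miovision_api.volumes.volume
        IS '15-minute vehicle count';  -- hypothetical table and comment text

Columns without such a comment simply have no pg_description row, which is why the script merges the comment result back with how = 'left' and fills the gaps with ''.
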
From d5fc506c2a1f57d91b99b75cef4b76fa479f1b9a Mon Sep 17 00:00:00 2001
From: Gabe Wolofsky <80077912+gabrielwol@users.noreply.github.com>
Date: Wed, 14 Jun 2023 16:25:08 -0400
Subject: [PATCH 04/10] Fix path for use in non-home dir. Add user input.

---
 bigdata_schema_readmes/generate_sql_readme.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/bigdata_schema_readmes/generate_sql_readme.py b/bigdata_schema_readmes/generate_sql_readme.py
index da64906..72f0f8c 100644
--- a/bigdata_schema_readmes/generate_sql_readme.py
+++ b/bigdata_schema_readmes/generate_sql_readme.py
@@ -4,16 +4,20 @@
 import configparser
 from psycopg2 import connect
 
+home_dir = os.path.expanduser('~')
+
 CONFIG = configparser.ConfigParser()
-CONFIG.read(str(Path.home().joinpath('db.cfg'))) #Creates a path to your db.cfg file
+CONFIG.read(os.path.join(home_dir, 'db.cfg')) #Creates a path to your db.cfg file
 dbset = CONFIG['DBSETTINGS']
 con = connect(**dbset)
 
 ######################
 ##schema name goes here
 ######################
-schema_name = 'miovision_api'
-row_count_on = True #change to false to omit row counts (can be very slow on certain schemas)
+schema_name = input("Input schema name to generate schema readme for:")
+#schema_name = 'rescu'
+row_count_on = input("Row count on? (True/False) Can be slow for certain schemas.")
+#row_count_on = True #change to false to omit row counts (can be very slow on certain schemas)
 
 #find table names from information_schema.tables
 table_sql = '''
@@ -63,7 +67,7 @@
 
 #create directory if not exists
 #home folder
-dir = "bigdata_schema_readmes"
+dir = home_dir + "/bigdata_schema_readmes"
 if os.path.exists(dir) is False:
     os.mkdir(dir)
     print("Creating directory: {}".format(dir))
@@ -73,6 +77,8 @@
 if os.path.isfile(fname):
     os.remove(fname)
 
+print("Destination path: " + fname)
+
 with con:
     #identify tables within schema
     tables = pandas.read_sql(table_sql.format(schema_name), con)

From 630f0173f33ee1734f2319ea0c16e3bb3822defc Mon Sep 17 00:00:00 2001
From: Gabe Wolofsky <80077912+gabrielwol@users.noreply.github.com>
Date: Mon, 19 Jun 2023 17:34:17 -0400
Subject: [PATCH 05/10] Fix issues with SQL string composition

---
 bigdata_schema_readmes/generate_sql_readme.py | 68 +++++++++++--------
 1 file changed, 40 insertions(+), 28 deletions(-)

diff --git a/bigdata_schema_readmes/generate_sql_readme.py b/bigdata_schema_readmes/generate_sql_readme.py
index 72f0f8c..4cd56d4 100644
--- a/bigdata_schema_readmes/generate_sql_readme.py
+++ b/bigdata_schema_readmes/generate_sql_readme.py
@@ -1,8 +1,8 @@
-import pandas
+import pandas as pd
 import os.path
-from pathlib import Path
 import configparser
 from psycopg2 import connect
+from psycopg2 import sql
 
 home_dir = os.path.expanduser('~')
 
@@ -14,41 +14,41 @@
 ######################
 ##schema name goes here
 ######################
-schema_name = input("Input schema name to generate schema readme for:")
+schema_name = input("Input schema name to generate schema readme for:")
 #schema_name = 'rescu'
 row_count_on = input("Row count on? (True/False) Can be slow for certain schemas.")
 #row_count_on = True #change to false to omit row counts (can be very slow on certain schemas)
 
 #find table names from information_schema.tables
-table_sql = '''
+table_sql = sql.SQL('''
 SELECT table_name
 FROM information_schema.tables
-WHERE table_schema = '{}'
+WHERE table_schema = {schema}
     AND table_type <> 'VIEW';
-'''
+''')
 
 #find column names and types from information_schema.columns
-columns_sql = '''
+columns_sql = sql.SQL('''
 SELECT column_name, data_type
 FROM information_schema.columns
-WHERE table_schema = '{}'
-    AND table_name = '{}';
-'''
+WHERE table_schema = {schema}
+    AND table_name = {table};
+''')
 
 #first row of table as sample
-sample_sql = '''
+sample_sql = sql.SQL('''
 SELECT *
-FROM {}.{}
+FROM {schema}.{table}
 LIMIT 1;
-'''
+''')
 
 #first row of table as sample
-rowcount_sql = '''
+rowcount_sql = sql.SQL('''
 SELECT COUNT(1)
-FROM {}.{};
-'''
+FROM {schema}.{table};
+''')
 
-column_comments_sql = '''
+column_comments_sql = sql.SQL('''
     SELECT
         c.column_name,
         pgd.description
@@ -61,9 +61,9 @@
         AND c.table_schema = st.schemaname
         AND c.table_name = st.relname
     )
-    WHERE c.table_schema = '{}'
-        AND c.table_name = '{}';
-'''
+    WHERE c.table_schema = {schema}
+        AND c.table_name = {table};
+''')
 
 #create directory if not exists
 #home folder
@@ -80,30 +80,40 @@
 print("Destination path: " + fname)
 
 with con:
-    #identify tables within schema
-    tables = pandas.read_sql(table_sql.format(schema_name), con)
+
+    #identify tables within schema
+    tables = pd.read_sql_query(table_sql.format(
+        schema = sql.Literal(schema_name)), con)
+
     if tables.empty:
-        print("No tables found in schema '{}'".format(schema_name))
+        print(f"No tables found in schema '{schema_name}'")
 
     #for each table
     for table_name in tables['table_name']:
 
         #query columns & datatypes from information_schema
-        column_types = pandas.read_sql(columns_sql.format(schema_name, table_name), con)
+        column_types = pd.read_sql(columns_sql.format(
+            schema = sql.Literal(schema_name),
+            table = sql.Literal(table_name)), con)
 
         #query sample row from schema.table and transpose
-        data_sample = pandas.read_sql(sample_sql.format(schema_name, table_name), con)
+        data_sample = pd.read_sql(sample_sql.format(
+            schema = sql.Identifier(schema_name),
+            table = sql.Identifier(table_name)), con)
         data_sample_T = data_sample.T
        data_sample_T["column_name"] = data_sample_T.index
         data_sample_T.rename(columns= {0: "sample"}, inplace=True)
 
         #row count
         if row_count_on:
-            row_count = pandas.read_sql(rowcount_sql.format(schema_name, table_name), con)
+            row_count = pd.read_sql(rowcount_sql.format(
+                schema = sql.Identifier(schema_name),
+                table = sql.Identifier(table_name)), con)
 
         #column comments --tested with miovision_api (has 3 column comments)
-        column_comments = pandas.read_sql(column_comments_sql.format(schema_name, table_name), con)
+        column_comments = pd.read_sql(column_comments_sql.format(
+            schema = sql.Literal(schema_name),
+            table = sql.Literal(table_name)), con)
 
         #merge sample with column types
         final = column_types.merge(data_sample_T, how = 'left', on = 'column_name')
@@ -121,4 +131,6 @@
         file.write("{}.{}\n".format(schema_name, table_name))
         if(row_count_on):
             file.write("Row count: {:,}\n".format(row_count['count'][0]))
-        file.write(final_formatted + "\n\n")
\ No newline at end of file
+        file.write(final_formatted + "\n\n")
+
+print(f"File path of output: {fname}")
\ No newline at end of file
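
The psycopg2.sql module adopted above composes queries from typed fragments instead of plain string interpolation: sql.Identifier renders a double-quoted name (for FROM clauses), while sql.Literal renders a quoted value (for comparisons against information_schema columns). A minimal sketch of the pattern, with hypothetical schema and table names:

    from psycopg2 import sql

    query = sql.SQL("SELECT * FROM {schema}.{table} LIMIT 1;").format(
        schema=sql.Identifier("rescu"),      # hypothetical names
        table=sql.Identifier("raw_20sec"),
    )
    # as_string() needs a connection (or cursor) to apply the server's quoting:
    # query.as_string(con) -> 'SELECT * FROM "rescu"."raw_20sec" LIMIT 1;'
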
From e992384d407875f7190569eb815d67fdbb1f2af5 Mon Sep 17 00:00:00 2001
From: gabrielwol <80077912+gabrielwol@users.noreply.github.com>
Date: Mon, 13 May 2024 17:24:17 -0400
Subject: [PATCH 06/10] updates to generate_sql_readme

---
 bigdata_schema_readmes/generate_sql_readme.py | 173 ++++++++++--------
 1 file changed, 95 insertions(+), 78 deletions(-)

diff --git a/bigdata_schema_readmes/generate_sql_readme.py b/bigdata_schema_readmes/generate_sql_readme.py
index 4cd56d4..44d90a7 100644
--- a/bigdata_schema_readmes/generate_sql_readme.py
+++ b/bigdata_schema_readmes/generate_sql_readme.py
@@ -1,72 +1,102 @@
 import pandas as pd
 import os.path
 import configparser
-from psycopg2 import connect
-from psycopg2 import sql
+import sqlalchemy
 
 home_dir = os.path.expanduser('~')
 
 CONFIG = configparser.ConfigParser()
 CONFIG.read(os.path.join(home_dir, 'db.cfg')) #Creates a path to your db.cfg file
-dbset = CONFIG['DBSETTINGS']
-con = connect(**dbset)
+
+"""
+sqlalchemy cfg format:
+[SQLALCHEMY]
+host=
+database=
+username=
+password=
+"""
+dbset = CONFIG['SQLALCHEMY']
+url_object = sqlalchemy.engine.URL.create(
+    "postgresql+psycopg2",
+    **dbset
+)
+engine = sqlalchemy.create_engine(url_object)
 
 ######################
 ##schema name goes here
 ######################
 schema_name = input("Input schema name to generate schema readme for:")
-#schema_name = 'rescu'
+#schema_name = 'ecocounter'
 row_count_on = input("Row count on? (True/False) Can be slow for certain schemas.")
 #row_count_on = True #change to false to omit row counts (can be very slow on certain schemas)
 
 #find table names from information_schema.tables
-table_sql = sql.SQL('''
+table_sql = '''
 SELECT table_name
 FROM information_schema.tables
-WHERE table_schema = {schema}
+WHERE table_schema = '{}'
     AND table_type <> 'VIEW';
-''')
+'''
 
 #find column names and types from information_schema.columns
-columns_sql = sql.SQL('''
+columns_sql = '''
 SELECT column_name, data_type
 FROM information_schema.columns
-WHERE table_schema = {schema}
-    AND table_name = {table};
-''')
+WHERE table_schema = '{}'
+    AND table_name = '{}';
+'''
+
+column_comments_sql = '''
+SELECT
+    a.attname AS column_name,
+    d.description AS "Comments"
+FROM pg_class AS c
+JOIN pg_attribute AS a ON c.oid = a.attrelid
+JOIN pg_namespace AS n ON n.oid = c.relnamespace
+JOIN pg_description AS d ON
+    d.objoid = c.oid
+    AND d.objsubid = a.attnum
+WHERE
+    n.nspname = '{}'
+    AND c.relname = '{}'
+    AND d.description IS NOT NULL;
+'''
+
+table_comments_sql = '''
+SELECT pgd.description
+FROM pg_description AS pgd
+JOIN pg_class AS pgc ON pgd.objoid = pgc.oid
+JOIN pg_namespace pgn ON pgc.relnamespace = pgn.oid
+WHERE
+    pgn.nspname = '{}'
+    AND pgc.relname = '{}'
+'''
 
 #first row of table as sample
-sample_sql = sql.SQL('''
+sample_sql = '''
 SELECT *
-FROM {schema}.{table}
+FROM {}.{}
 LIMIT 1;
-''')
+'''
 
-#first row of table as sample
-rowcount_sql = sql.SQL('''
+#rowcount
+rowcount_sql = '''
 SELECT COUNT(1)
-FROM {schema}.{table};
-''')
-
-column_comments_sql = sql.SQL('''
-    SELECT
-        c.column_name,
-        pgd.description
-    FROM pg_catalog.pg_statio_all_tables AS st
-    INNER JOIN pg_catalog.pg_description AS pgd ON (
-        pgd.objoid = st.relid
-    )
-    INNER JOIN information_schema.columns AS c ON (
-        pgd.objsubid = c.ordinal_position
-        AND c.table_schema = st.schemaname
-        AND c.table_name = st.relname
-    )
-    WHERE c.table_schema = {schema}
-        AND c.table_name = {table};
-''')
+FROM {}.{};
+'''
+
+#Don't fail if some columns are not in the dataset.
+#Source: https://stackoverflow.com/a/62658311
+def custom_dataset(dataset, req_cols):
+    in_ = []
+    if isinstance(dataset, pd.DataFrame): # optional
+        for col in req_cols: # check for every existing column
+            if col in dataset.columns:
+                in_.append(col) # append those that are in (i.e. valid)
+    return dataset[in_] if in_ else None
 
 #create directory if not exists
-#home folder
 dir = home_dir + "/bigdata_schema_readmes"
 if os.path.exists(dir) is False:
     os.mkdir(dir)
@@ -77,60 +107,47 @@
 if os.path.isfile(fname):
     os.remove(fname)
 
-print("Destination path: " + fname)
-
-with con:
-
-    #identify tables within schema
-    tables = pd.read_sql_query(table_sql.format(
-        schema = sql.Literal(schema_name)), con)
-
+with engine.connect() as con:
+    #identify tables within schema
+    tables = pd.read_sql(table_sql, con, params=(schema_name,))
     if tables.empty:
-        print(f"No tables found in schema '{schema_name}'")
-
+        print("No tables found in schema '{}'".format(schema_name))
     #for each table
-    for table_name in tables['table_name']:
-
+    for table_name in tables['table_name']:
+        print(table_name)
         #query columns & datatypes from information_schema
-        column_types = pd.read_sql(columns_sql.format(
-            schema = sql.Literal(schema_name),
-            table = sql.Literal(table_name)), con)
-
+        column_types = pd.read_sql(columns_sql, con, params=(schema_name, table_name))
+        column_comments = pd.read_sql(column_comments_sql, con, params=(schema_name, table_name))
         #query sample row from schema.table and transpose
-        data_sample = pd.read_sql(sample_sql.format(
-            schema = sql.Identifier(schema_name),
-            table = sql.Identifier(table_name)), con)
+        data_sample = pd.read_sql(sample_sql, con, params=(schema_name, table_name))
         data_sample_T = data_sample.T
         data_sample_T["column_name"] = data_sample_T.index
-        data_sample_T.rename(columns= {0: "sample"}, inplace=True)
-
+        data_sample_T.rename(columns= {0: "sample"}, inplace=True)
+        table_comments = pd.read_sql(table_comments_sql, con, params=(schema_name, table_name))
+        try:
+            table_comment = table_comments['description'][0]
+        except KeyError:
+            table_comment = ''
         #row count
         if row_count_on:
-            row_count = pd.read_sql(rowcount_sql.format(
-                schema = sql.Identifier(schema_name),
-                table = sql.Identifier(table_name)), con)
-
-        #column comments --tested with miovision_api (has 3 column comments)
-        column_comments = pd.read_sql(column_comments_sql.format(
-            schema = sql.Literal(schema_name),
-            table = sql.Literal(table_name)), con)
-
-        #merge sample with column types
-        final = column_types.merge(data_sample_T, how = 'left', on = 'column_name')
-        final = final.merge(column_comments, how = 'left', on = 'column_name')
-        final['description'] = final['description'].fillna('')
-
+            row_count = pd.read_sql(rowcount_sql, con, params=(schema_name, table_name))
+        #merge sample with column types, comments
+        final = column_types.merge(data_sample_T, on = 'column_name')
+        final = column_comments.merge(final, on = 'column_name', how='right')
+        #reorder columns
+        final=custom_dataset(final, ['Column Name', 'Data Type', 'Sample', 'Comments'])
+        #replace nans
+        final.fillna('', inplace=True)
         #markdown format for github
-        final_formatted = final.to_markdown(index = False)
-
+        final_formatted = final.to_markdown(index = False, tablefmt="github")
         #print for debugging
-        print(final_formatted)
-
+        #print(final_formatted)
         #write formatted output with table name as header
         with open(fname, "a") as file: #append
-            file.write("{}.{}\n".format(schema_name, table_name))
+            file.write("### `{}.{}`\n".format(schema_name, table_name))
+            file.write(f"{table_comment}\n\n")
             if(row_count_on):
                 file.write("Row count: {:,}\n".format(row_count['count'][0]))
             file.write(final_formatted + "\n\n")
 
-print(f"File path of output: {fname}")
\ No newline at end of file
+print(f"File path of output: {fname}")
From 1a3fa6c0b40b2985e52eb50e0a0b869316a54843 Mon Sep 17 00:00:00 2001
From: gabrielwol <80077912+gabrielwol@users.noreply.github.com>
Date: Mon, 7 Apr 2025 18:59:40 +0000
Subject: [PATCH 07/10] #21 fix up sqlalchemy working version; approx row
 counts, add object type, prefix filtering

---
 bigdata_schema_readmes/generate_sql_readme.py | 144 ++++++++++--------
 1 file changed, 82 insertions(+), 62 deletions(-)

diff --git a/bigdata_schema_readmes/generate_sql_readme.py b/bigdata_schema_readmes/generate_sql_readme.py
index 44d90a7..fd7b754 100644
--- a/bigdata_schema_readmes/generate_sql_readme.py
+++ b/bigdata_schema_readmes/generate_sql_readme.py
@@ -2,6 +2,7 @@
 import os.path
 import configparser
 import sqlalchemy
+from sqlalchemy import sql
 
 home_dir = os.path.expanduser('~')
 
@@ -26,65 +27,79 @@
 ######################
 ##schema name goes here
 ######################
-schema_name = input("Input schema name to generate schema readme for:")
-#schema_name = 'ecocounter'
-row_count_on = input("Row count on? (True/False) Can be slow for certain schemas.")
-#row_count_on = True #change to false to omit row counts (can be very slow on certain schemas)
+input_schema = input("Input schema name to generate schema readme for:")
+
+# Parse schema and prefix from input
+try:
+    schema_name, table_prefix = input_schema.split('.')
+except ValueError:
+    schema_name = input_schema
 
 #find table names from information_schema.tables
-table_sql = '''
-SELECT table_name
-FROM information_schema.tables
-WHERE table_schema = '{}'
-    AND table_type <> 'VIEW';
-'''
+if table_prefix:
+    table_sql = sql.text('''
+        SELECT
+            c.relname AS table_name,
+            CASE c.relkind
+                WHEN 'p' THEN 'partitioned table'
+                WHEN 'r' THEN 'table'
+            END AS table_type
+        FROM pg_catalog.pg_class AS c
+        JOIN pg_catalog.pg_namespace AS n ON n.oid = c.relnamespace
+        WHERE
+            n.nspname = :schema
+            AND c.relname LIKE :prefix
+            -- add more type: https://www.postgresql.org/docs/current/catalog-pg-class.html
+            AND c.relkind = ANY('{p,r}')
+            AND NOT c.relispartition --exclude child partitions
+        ORDER BY 1, 2;
+    ''')
+else:
+    table_sql = sql.text('''
+        SELECT
+            c.relname AS table_name,
+            CASE c.relkind
+                WHEN 'p' THEN 'partitioned table'
+                WHEN 'r' THEN 'table'
+            END AS table_type
+        FROM pg_catalog.pg_class AS c
+        JOIN pg_catalog.pg_namespace AS n ON n.oid = c.relnamespace
+        WHERE
+            n.nspname = :schema
+            -- add more type: https://www.postgresql.org/docs/current/catalog-pg-class.html
+            AND c.relkind = ANY('{p,r}')
+            AND NOT c.relispartition --exclude child partitions
+        ORDER BY 1, 2;
+    ''')
 
 #find column names and types from information_schema.columns
-columns_sql = '''
-SELECT column_name, data_type
-FROM information_schema.columns
-WHERE table_schema = '{}'
-    AND table_name = '{}';
-'''
-
-column_comments_sql = '''
+columns_sql = sql.text('''
 SELECT
-    a.attname AS column_name,
-    d.description AS "Comments"
+    a.attname AS "Column Name",
+    d.description AS "Comments",
+    pg_catalog.format_type(a.atttypid, a.atttypmod) as "Data type"
 FROM pg_class AS c
 JOIN pg_attribute AS a ON c.oid = a.attrelid
 JOIN pg_namespace AS n ON n.oid = c.relnamespace
-JOIN pg_description AS d ON
+LEFT JOIN pg_description AS d ON
     d.objoid = c.oid
     AND d.objsubid = a.attnum
 WHERE
-    n.nspname = '{}'
-    AND c.relname = '{}'
-    AND d.description IS NOT NULL;
-'''
+    n.nspname = :schema
+    AND c.relname = :table
+    AND attisdropped = false
+    AND attnum >= 1;
+''')
 
-table_comments_sql = '''
+table_comments_sql = sql.text('''
 SELECT pgd.description
 FROM pg_description AS pgd
 JOIN pg_class AS pgc ON pgd.objoid = pgc.oid
 JOIN pg_namespace pgn ON pgc.relnamespace = pgn.oid
 WHERE
-    pgn.nspname = '{}'
-    AND pgc.relname = '{}'
-'''
-
-#first row of table as sample
-sample_sql = '''
-SELECT *
-FROM {}.{}
-LIMIT 1;
-'''
-
-#rowcount
-rowcount_sql = '''
-SELECT COUNT(1)
-FROM {}.{};
-'''
+    pgn.nspname = :schema
+    AND pgc.relname = :table
+''')
 
 #Don't fail if some columns are not in the dataset.
 #Source: https://stackoverflow.com/a/62658311
@@ -109,31 +124,35 @@ def custom_dataset(dataset, req_cols):
 
 with engine.connect() as con:
     #identify tables within schema
-    tables = pd.read_sql(table_sql, con, params=(schema_name,))
+    if table_prefix:
+        tables = pd.read_sql_query(table_sql, con, params={'schema': schema_name, 'prefix': f"{table_prefix}%"})
+    else:
+        tables = pd.read_sql_query(table_sql, con, params={'schema': schema_name})
     if tables.empty:
         print("No tables found in schema '{}'".format(schema_name))
     #for each table
     for table_name in tables['table_name']:
-        print(table_name)
+        print(f"Processing {table_name}...")
         #query columns & datatypes from information_schema
-        column_types = pd.read_sql(columns_sql, con, params=(schema_name, table_name))
-        column_comments = pd.read_sql(column_comments_sql, con, params=(schema_name, table_name))
-        #query sample row from schema.table and transpose
-        data_sample = pd.read_sql(sample_sql, con, params=(schema_name, table_name))
+        column_types = pd.read_sql_query(columns_sql, con, params={'schema': schema_name, 'table': table_name})
+        #query sample row from schema.table and transpose
+        sample_query = sql.text(f"SELECT * FROM {schema_name}.{table_name} LIMIT 1")
+        data_sample = pd.read_sql_query(sample_query, con)
         data_sample_T = data_sample.T
-        data_sample_T["column_name"] = data_sample_T.index
-        data_sample_T.rename(columns= {0: "sample"}, inplace=True)
-        table_comments = pd.read_sql(table_comments_sql, con, params=(schema_name, table_name))
+        data_sample_T["Column Name"] = data_sample_T.index
+        data_sample_T.rename(columns= {0: "Sample"}, inplace=True)
+        table_comments = pd.read_sql_query(table_comments_sql, con, params={'schema': schema_name, 'table': table_name})
         try:
             table_comment = table_comments['description'][0]
         except KeyError:
             table_comment = ''
-        #row count
-        if row_count_on:
-            row_count = pd.read_sql(rowcount_sql, con, params=(schema_name, table_name))
+        #approx row count
+        rowcount_sql = sql.text(f'''
+            SELECT TO_CHAR(COUNT(1) * 100, '999,999,999,999,999') AS c FROM {schema_name}.{table_name} TABLESAMPLE SYSTEM (1);
+        ''')
+        row_count = pd.read_sql_query(rowcount_sql, con)
         #merge sample with column types, comments
-        final = column_types.merge(data_sample_T, on = 'column_name')
-        final = column_comments.merge(final, on = 'column_name', how='right')
+        final = column_types.merge(data_sample_T, on = 'Column Name')
         #reorder columns
         final=custom_dataset(final, ['Column Name', 'Data Type', 'Sample', 'Comments'])
         #replace nans
         final.fillna('', inplace=True)
         #markdown format for github
         final_formatted = final.to_markdown(index = False, tablefmt="github")
         #print for debugging
         #print(final_formatted)
-        #write formatted output with table name as header
+        object_type = tables.loc[tables.table_name == table_name, 'table_type'].iloc[0]
+
+        #write formatted output with table name as header
         with open(fname, "a") as file: #append
-            file.write("### `{}.{}`\n".format(schema_name, table_name))
-            file.write(f"{table_comment}\n\n")
-            if(row_count_on):
-                file.write("Row count: {:,}\n".format(row_count['count'][0]))
+            file.write(f"### `{schema_name}.{table_name}` ({object_type})\n")
+            file.write(f"{table_comment}\n")
+            file.write(f"Approx row count: {row_count['c'][0]}\n")
             file.write(final_formatted + "\n\n")
 
-print(f"File path of output: {fname}")
+print(f"File path of output: {fname}")
\ No newline at end of file

From 722f21dd5310a960f058b0a831708908a1b8bb54 Mon Sep 17 00:00:00 2001
From: gabrielwol <80077912+gabrielwol@users.noreply.github.com>
Date: Mon, 7 Apr 2025 19:06:38 +0000
Subject: [PATCH 08/10] #21 fix column header mismatch

---
 bigdata_schema_readmes/generate_sql_readme.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/bigdata_schema_readmes/generate_sql_readme.py b/bigdata_schema_readmes/generate_sql_readme.py
index fd7b754..4e499b1 100644
--- a/bigdata_schema_readmes/generate_sql_readme.py
+++ b/bigdata_schema_readmes/generate_sql_readme.py
@@ -77,7 +77,7 @@
 SELECT
     a.attname AS "Column Name",
     d.description AS "Comments",
-    pg_catalog.format_type(a.atttypid, a.atttypmod) as "Data type"
+    pg_catalog.format_type(a.atttypid, a.atttypmod) as "Data Type"
 FROM pg_class AS c
 JOIN pg_attribute AS a ON c.oid = a.attrelid
 JOIN pg_namespace AS n ON n.oid = c.relnamespace
@@ -159,11 +159,9 @@ def custom_dataset(dataset, req_cols):
         final.fillna('', inplace=True)
         #markdown format for github
         final_formatted = final.to_markdown(index = False, tablefmt="github")
-        #print for debugging
-        #print(final_formatted)
-        #write formatted output with table name as header
         object_type = tables.loc[tables.table_name == table_name, 'table_type'].iloc[0]
+        #write formatted output with table name as header
         with open(fname, "a") as file: #append
             file.write(f"### `{schema_name}.{table_name}` ({object_type})\n")
             file.write(f"{table_comment}\n")
From 9676fffb44845a3a2310c47d41120328d1bae330 Mon Sep 17 00:00:00 2001
From: gabrielwol <80077912+gabrielwol@users.noreply.github.com>
Date: Tue, 8 Apr 2025 13:58:10 +0000
Subject: [PATCH 09/10] #21 add cli

---
 bigdata_schema_readmes/generate_sql_readme.py | 183 +++++++++---------
 1 file changed, 95 insertions(+), 88 deletions(-)

diff --git a/bigdata_schema_readmes/generate_sql_readme.py b/bigdata_schema_readmes/generate_sql_readme.py
index 4e499b1..e3f366f 100644
--- a/bigdata_schema_readmes/generate_sql_readme.py
+++ b/bigdata_schema_readmes/generate_sql_readme.py
@@ -3,6 +3,7 @@
 import configparser
 import sqlalchemy
 from sqlalchemy import sql
+import click
 
 home_dir = os.path.expanduser('~')
 
@@ -24,38 +25,27 @@
 )
 engine = sqlalchemy.create_engine(url_object)
 
-######################
-##schema name goes here
-######################
-input_schema = input("Input schema name to generate schema readme for:")
-
-# Parse schema and prefix from input
-try:
-    schema_name, table_prefix = input_schema.split('.')
-except ValueError:
-    schema_name = input_schema
-
-#find table names from information_schema.tables
-if table_prefix:
-    table_sql = sql.text('''
-        SELECT
-            c.relname AS table_name,
-            CASE c.relkind
-                WHEN 'p' THEN 'partitioned table'
-                WHEN 'r' THEN 'table'
-            END AS table_type
-        FROM pg_catalog.pg_class AS c
-        JOIN pg_catalog.pg_namespace AS n ON n.oid = c.relnamespace
-        WHERE
-            n.nspname = :schema
-            AND c.relname LIKE :prefix
-            -- add more type: https://www.postgresql.org/docs/current/catalog-pg-class.html
-            AND c.relkind = ANY('{p,r}')
-            AND NOT c.relispartition --exclude child partitions
-        ORDER BY 1, 2;
-    ''')
-else:
-    table_sql = sql.text('''
+#find table names to iterate over
+def table_sql(table_prefix = None):
+    if table_prefix is not None:
+        return sql.text('''
+            SELECT
+                c.relname AS table_name,
+                CASE c.relkind
+                    WHEN 'p' THEN 'partitioned table'
+                    WHEN 'r' THEN 'table'
+                END AS table_type
+            FROM pg_catalog.pg_class AS c
+            JOIN pg_catalog.pg_namespace AS n ON n.oid = c.relnamespace
+            WHERE
+                n.nspname = :schema
+                AND c.relname LIKE :prefix
+                -- add more type: https://www.postgresql.org/docs/current/catalog-pg-class.html
+                AND c.relkind = ANY('{p,r}')
+                AND NOT c.relispartition --exclude child partitions
+            ORDER BY 1, 2;
+        ''')
+    return sql.text('''
         SELECT
             c.relname AS table_name,
             CASE c.relkind
@@ -66,7 +56,7 @@
         JOIN pg_catalog.pg_namespace AS n ON n.oid = c.relnamespace
         WHERE
             n.nspname = :schema
-            -- add more type: https://www.postgresql.org/docs/current/catalog-pg-class.html
+            -- tables and partitioned tables
            AND c.relkind = ANY('{p,r}')
             AND NOT c.relispartition --exclude child partitions
         ORDER BY 1, 2;
@@ -101,61 +101,78 @@ def custom_dataset(dataset, req_cols):
             in_.append(col) # append those that are in (i.e. valid)
     return dataset[in_] if in_ else None
 
-#create directory if not exists
-dir = home_dir + "/bigdata_schema_readmes"
-if os.path.exists(dir) is False:
-    os.mkdir(dir)
-    print("Creating directory: {}".format(dir))
+def get_schema_readmes(schema_name, table_prefix):
+    #create directory if not exists
+    dir = home_dir + "/bigdata_schema_readmes"
+    if os.path.exists(dir) is False:
+        os.mkdir(dir)
+        print("Creating directory: {}".format(dir))
 
-#remove file if exists
-fname = dir + "/{}_readme.txt".format(schema_name)
-if os.path.isfile(fname):
-    os.remove(fname)
+    #remove file if exists
+    fname = dir + "/{}_readme.txt".format(schema_name)
+    if os.path.isfile(fname):
+        os.remove(fname)
 
-with engine.connect() as con:
-    #identify tables within schema
-    if table_prefix:
-        tables = pd.read_sql_query(table_sql, con, params={'schema': schema_name, 'prefix': f"{table_prefix}%"})
-    else:
-        tables = pd.read_sql_query(table_sql, con, params={'schema': schema_name})
-    if tables.empty:
-        print("No tables found in schema '{}'".format(schema_name))
-    #for each table
-    for table_name in tables['table_name']:
-        print(f"Processing {table_name}...")
-        #query columns & datatypes from information_schema
-        column_types = pd.read_sql_query(columns_sql, con, params={'schema': schema_name, 'table': table_name})
-        #query sample row from schema.table and transpose
-        sample_query = sql.text(f"SELECT * FROM {schema_name}.{table_name} LIMIT 1")
-        data_sample = pd.read_sql_query(sample_query, con)
-        data_sample_T = data_sample.T
-        data_sample_T["Column Name"] = data_sample_T.index
-        data_sample_T.rename(columns= {0: "Sample"}, inplace=True)
-        table_comments = pd.read_sql_query(table_comments_sql, con, params={'schema': schema_name, 'table': table_name})
-        try:
-            table_comment = table_comments['description'][0]
-        except KeyError:
-            table_comment = ''
-        #approx row count
-        rowcount_sql = sql.text(f'''
-            SELECT TO_CHAR(COUNT(1) * 100, '999,999,999,999,999') AS c FROM {schema_name}.{table_name} TABLESAMPLE SYSTEM (1);
-        ''')
-        row_count = pd.read_sql_query(rowcount_sql, con)
-        #merge sample with column types, comments
-        final = column_types.merge(data_sample_T, on = 'Column Name')
-        #reorder columns
-        final=custom_dataset(final, ['Column Name', 'Data Type', 'Sample', 'Comments'])
-        #replace nans
-        final.fillna('', inplace=True)
-        #markdown format for github
-        final_formatted = final.to_markdown(index = False, tablefmt="github")
-        object_type = tables.loc[tables.table_name == table_name, 'table_type'].iloc[0]
-        #write formatted output with table name as header
-        with open(fname, "a") as file: #append
-            file.write(f"### `{schema_name}.{table_name}` ({object_type})\n")
-            file.write(f"{table_comment}\n")
-            file.write(f"Approx row count: {row_count['c'][0]}\n")
-            file.write(final_formatted + "\n\n")
+    with engine.connect() as con:
+        #identify tables within schema
+        if table_prefix is not None:
+            tables = pd.read_sql_query(table_sql(table_prefix), con, params={'schema': schema_name, 'prefix': f"{table_prefix}%"})
+        else:
+            tables = pd.read_sql_query(table_sql(), con, params={'schema': schema_name})
+        if tables.empty:
+            print("No tables found in schema '{}'".format(schema_name))
+        #for each table
+        for table_name in tables['table_name']:
+            print(f"Processing {table_name}...")
+            #query columns & datatypes from information_schema
+            column_types = pd.read_sql_query(columns_sql, con, params={'schema': schema_name, 'table': table_name})
+            #query sample row from schema.table and transpose
+            sample_query = sql.text(f"SELECT * FROM {schema_name}.{table_name} LIMIT 1")
+            data_sample = pd.read_sql_query(sample_query, con)
+            data_sample_T = data_sample.T
+            data_sample_T["Column Name"] = data_sample_T.index
+            data_sample_T.rename(columns= {0: "Sample"}, inplace=True)
+            table_comments = pd.read_sql_query(table_comments_sql, con, params={'schema': schema_name, 'table': table_name})
+            try:
+                table_comment = table_comments['description'][0]
+            except KeyError:
+                table_comment = ''
+            #approx row count
+            rowcount_sql = sql.text(f'''
+                SELECT TO_CHAR(COUNT(1) * 100, '999,999,999,999,999') AS c FROM {schema_name}.{table_name} TABLESAMPLE SYSTEM (1);
+            ''')
+            row_count = pd.read_sql_query(rowcount_sql, con)
+            #merge sample with column types, comments
+            final = column_types.merge(data_sample_T, on = 'Column Name')
+            #reorder columns
+            final=custom_dataset(final, ['Column Name', 'Data Type', 'Sample', 'Comments'])
+            #replace nans
+            final.fillna('', inplace=True)
+            #markdown format for github
+            final_formatted = final.to_markdown(index = False, tablefmt="github")
+            object_type = tables.loc[tables.table_name == table_name, 'table_type'].iloc[0]
+            #write formatted output with table name as header
+            with open(fname, "a") as file: #append
+                file.write(f"### `{schema_name}.{table_name}` ({object_type})\n")
+                file.write(f"{table_comment}\n")
+                file.write(f"Approx row count: {row_count['c'][0]}\n")
+                file.write(final_formatted + "\n\n")
+
+    print(f"File path of output: {fname}")
+
+@click.command()
+@click.option('--schema-name', '-s', type = str, required = True, help = 'Name of destination schema')
+@click.option('--table-prefix', '-t', type = str, default=None, required = False, help = 'Optional table prefix')
+def get_schema_readmes_cli(schema_name, table_prefix):
+    """
+    This script generates readmes for a schema in bigdata. Optionally filter using `table-prefix` param.
+
+    Example:
+
+    python3 generate_sql_readme.py --schema-name bluetooth --table-prefix itsc
+    """
+    get_schema_readmes(schema_name=schema_name, table_prefix=table_prefix)
 
-print(f"File path of output: {fname}")
\ No newline at end of file
+if __name__ == '__main__':
+    get_schema_readmes_cli()
\ No newline at end of file

From 75979886d43178245b648e4e78adcf5980581479 Mon Sep 17 00:00:00 2001
From: gabrielwol <80077912+gabrielwol@users.noreply.github.com>
Date: Tue, 8 Apr 2025 14:01:57 +0000
Subject: [PATCH 10/10] #21 truncate long columns

---
 bigdata_schema_readmes/generate_sql_readme.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/bigdata_schema_readmes/generate_sql_readme.py b/bigdata_schema_readmes/generate_sql_readme.py
index e3f366f..66454cc 100644
--- a/bigdata_schema_readmes/generate_sql_readme.py
+++ b/bigdata_schema_readmes/generate_sql_readme.py
@@ -132,6 +132,7 @@ def get_schema_readmes(schema_name, table_prefix):
             data_sample_T = data_sample.T
             data_sample_T["Column Name"] = data_sample_T.index
             data_sample_T.rename(columns= {0: "Sample"}, inplace=True)
+            data_sample_T['Sample'] = data_sample_T['Sample'].apply(lambda x: str(x)[:80])
             table_comments = pd.read_sql_query(table_comments_sql, con, params={'schema': schema_name, 'table': table_name})
             try:
                 table_comment = table_comments['description'][0]
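
With the full series applied, a run looks roughly like this (the short flags come from the click options; the schema, table, and path below are illustrative only):

    $ python3 generate_sql_readme.py -s bluetooth -t itsc
    Processing itsc_central_vds_config...
    File path of output: /home/jdoe/bigdata_schema_readmes/bluetooth_readme.txt
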