From 288897fef908e05d3e70366f3a3d948319382723 Mon Sep 17 00:00:00 2001 From: Ross Thomson Date: Tue, 4 Aug 2020 18:10:03 +0000 Subject: [PATCH 001/104] First pass at adding Statslib --- .gitignore | 3 ++ .init | 15 +++++++++ udfs/statslib/README.md | 53 ++++++++++++++++++++++++++++++++ udfs/statslib/kruskal_wallis.sql | 36 ++++++++++++++++++++++ udfs/statslib/pvalue.sql | 18 +++++++++++ udfs/statslib/test_cases.yaml | 4 +++ udfs/tests/udf_test_utils.py | 4 +++ 7 files changed, 133 insertions(+) create mode 100644 .init create mode 100644 udfs/statslib/README.md create mode 100644 udfs/statslib/kruskal_wallis.sql create mode 100644 udfs/statslib/pvalue.sql create mode 100644 udfs/statslib/test_cases.yaml diff --git a/.gitignore b/.gitignore index b9845688f..af74a00cf 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ # Compiled class file *.class +*.pyc # Log file *.log @@ -27,3 +28,5 @@ hs_err_pid* .vscode .idea/ target/ +tools/unsupervised_dataset/sql_crawler/__pycache__/__init__.cpython-38.pyc +key.json diff --git a/.init b/.init new file mode 100644 index 000000000..4910177dd --- /dev/null +++ b/.init @@ -0,0 +1,15 @@ + +# if lsst virutalenv exists, use it. + +if [ -f ~/.bigquery/bin/activate ]; then + . ~/.bigquery/bin/activate +fi + +if [ -f $(pwd)/key.json ]; then + export GOOGLE_APPLICATION_CREDENTIALS="$(pwd)/key.json" +fi + +gcloud config set project nih-bq +gcloud config set compute/zone us-central1-f +gcloud config set compute/region us-central1 + diff --git a/udfs/statslib/README.md b/udfs/statslib/README.md new file mode 100644 index 000000000..9f60eea47 --- /dev/null +++ b/udfs/statslib/README.md @@ -0,0 +1,53 @@ +# Statistical UDFs + +This directory contains community contributed [user-defined functions](https://cloud.google.com/bigquery/docs/reference/standard-sql/user-defined-functions) for Statistical Analysis +to extend BigQuery for more specialized usage patterns. 
Each UDF within this +directory will be automatically synchronized to the `bqutil` project within the +`fn` dataset for reference in queries. + +For example, if you'd like to reference the `int` function within your query, +you can reference it like the following: +```sql +SELECT bqutil.fn.int(1.684) +``` + +## UDFs + +* [kruskal_wallis_udf](#kruskal_wallisarrstructfactor-string-val-float64) + +## Documentation + +### [kruskal_wallis(arr(struct(factor STRING, val FLOAT64))](kruskal_wallis.sql) +Takes an array of struct where each struct (point) represents a measurement, with a group label and a measurement value + +The [Kruskal–Wallis test by ranks](https://en.wikipedia.org/wiki/Kruskal%E2%80%93Wallis_one-way_analysis_of_variance), Kruskal–Wallis H test (named after William Kruskal and W. Allen Wallis), or one-way ANOVA on ranks is a non-parametric method for testing whether samples originate from the same distribution. It is used for comparing two or more independent samples of equal or different sample sizes. It extends the Mann–Whitney U test, which is used for comparing only two groups. The parametric equivalent of the Kruskal–Wallis test is the one-way analysis of variance (ANOVA). 
+ +* Input: array: struct +* Output: struct +```sql +DECLARE data ARRAY>; + +set data = [ +('a',1.0), +('b',2.0), +('c',2.3), +('a',1.4), +('b',2.2), +('c',5.5), +('a',1.0), +('b',2.3), +('c',2.3), +('a',1.1), +('b',7.2), +('c',2.8) +]; + + +SELECT `lib_stats.kruskal_wallis_udf`(data) as results; +``` + +results: + +| results.H | results.p | results.DoF | +|-----------|-----------|-------------| +| 3.4230769 | 0.1805877 | 2 | \ No newline at end of file diff --git a/udfs/statslib/kruskal_wallis.sql b/udfs/statslib/kruskal_wallis.sql new file mode 100644 index 000000000..1a405ef7a --- /dev/null +++ b/udfs/statslib/kruskal_wallis.sql @@ -0,0 +1,36 @@ +/* + * Copyright 2019 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +CREATE OR REPLACE FUNCTION st.kruskal_wallis(data ARRAY>) AS (( + with H_raw AS( + with sums AS + ( + with rank_data AS + ( + select d.factor AS f, d.val AS v, rank() over(order by d.val) AS r + from unnest(data) AS d + ) #rank_data + select + SUM(r) * SUM(r) / COUNT(*) AS sumranks, COUNT(*) AS n + from rank_data + GROUP BY f + ) # sums + SELECT 12.00 /(SUM(n) *(SUM(n) + 1)) * SUM(sumranks) -(3.00 *(SUM(n) + 1)) AS H, + count(n) -1 AS DoF + FROM sums + ) # H_raw + SELECT struct(H AS H, st.pvalue(H, DoF) AS p, DoF AS DoF) from H_raw +)); \ No newline at end of file diff --git a/udfs/statslib/pvalue.sql b/udfs/statslib/pvalue.sql new file mode 100644 index 000000000..bd542ab19 --- /dev/null +++ b/udfs/statslib/pvalue.sql @@ -0,0 +1,18 @@ +#standardSQL + +/* +## clone git repot +> git clone https://github.com/jstat/jstat.git + +## sync jstat to GCS as +> gsutil rsync -r ./jstat gs://bq-stats-test-jslib/jstat + +*/ + +CREATE OR REPLACE FUNCTION st.pvalue(H FLOAT64, dof INT64) +RETURNS NUMERIC +LANGUAGE js AS """ + return 1.0 - jStat['chisquare'].cdf(H, dof) +""" +OPTIONS ( + library="gs://isb_nih/jstat/dist/jstat.js"); \ No newline at end of file diff --git a/udfs/statslib/test_cases.yaml b/udfs/statslib/test_cases.yaml new file mode 100644 index 000000000..f1443c9d1 --- /dev/null +++ b/udfs/statslib/test_cases.yaml @@ -0,0 +1,4 @@ +pvalue: + - test: + input: CAST(0.22222222 AS FLOAT64), CAST(0.88888888 AS FLOAT64) + output:0.22222222 \ No newline at end of file diff --git a/udfs/tests/udf_test_utils.py b/udfs/tests/udf_test_utils.py index ce00feac1..03c146479 100644 --- a/udfs/tests/udf_test_utils.py +++ b/udfs/tests/udf_test_utils.py @@ -31,6 +31,7 @@ 'teradata': 'td_test', 'vertica': 've_test', 'community': 'fn_test', + 'statslib': 'st_test', } UDF_PARENT_DIR = 'udfs/' @@ -81,10 +82,13 @@ def replace_with_test_datasets(udf_path=None, project_id=None, udf_sql=None): with open(udf_path) as udf_file: udf_sql = udf_file.read() udf_length_before_replacement = 
len(udf_sql) + print("project_id: " + project_id) + print("udf_sql: " + udf_sql) udf_sql = re.sub( r'(\w+\.)?(?P\w+)(?P\.\w+)\(', f'`{project_id}.\\g_test_{os.getenv("SHORT_SHA")}\\g`(', udf_sql) + print("udf_sql: " + udf_sql) if udf_length_before_replacement == len(udf_sql): return None else: From 2705f30efc89d84673d845aa2f5f909176707374 Mon Sep 17 00:00:00 2001 From: Ross Thomson Date: Wed, 5 Aug 2020 20:41:04 +0000 Subject: [PATCH 002/104] Interim update. --- .gitignore | 1 + udfs/statslib/README.md | 4 ++-- udfs/statslib/pvalue.sql | 2 +- udfs/statslib/test_cases.yaml | 10 ++++++++-- 4 files changed, 12 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index af74a00cf..3ca0c2bf1 100644 --- a/.gitignore +++ b/.gitignore @@ -30,3 +30,4 @@ hs_err_pid* target/ tools/unsupervised_dataset/sql_crawler/__pycache__/__init__.cpython-38.pyc key.json +BigQueryUtils.code-workspace diff --git a/udfs/statslib/README.md b/udfs/statslib/README.md index 9f60eea47..b5ab7076a 100644 --- a/udfs/statslib/README.md +++ b/udfs/statslib/README.md @@ -13,7 +13,7 @@ SELECT bqutil.fn.int(1.684) ## UDFs -* [kruskal_wallis_udf](#kruskal_wallisarrstructfactor-string-val-float64) +* [kruskal_wallis](#kruskal_wallisarrstructfactor-string-val-float64) ## Documentation @@ -43,7 +43,7 @@ set data = [ ]; -SELECT `lib_stats.kruskal_wallis_udf`(data) as results; +SELECT `lib_stats.kruskal_wallis`(data) as results; ``` results: diff --git a/udfs/statslib/pvalue.sql b/udfs/statslib/pvalue.sql index bd542ab19..d90d1f725 100644 --- a/udfs/statslib/pvalue.sql +++ b/udfs/statslib/pvalue.sql @@ -10,7 +10,7 @@ */ CREATE OR REPLACE FUNCTION st.pvalue(H FLOAT64, dof INT64) -RETURNS NUMERIC +RETURNS FLOAT64 LANGUAGE js AS """ return 1.0 - jStat['chisquare'].cdf(H, dof) """ diff --git a/udfs/statslib/test_cases.yaml b/udfs/statslib/test_cases.yaml index f1443c9d1..c5d5bb3a4 100644 --- a/udfs/statslib/test_cases.yaml +++ b/udfs/statslib/test_cases.yaml @@ -1,4 +1,10 @@ pvalue: - test: - input: 
CAST(0.22222222 AS FLOAT64), CAST(0.88888888 AS FLOAT64) - output:0.22222222 \ No newline at end of file + input: CAST(0.3 AS FLOAT64), CAST(2 AS INT64) + expected_output: CAST(0.8607079764250578 AS FLOAT64) +kruskal_wallis: + - test: + input: [('a',1.0), ('b',2.0), ('c',2.3), ('a',1.4), ('b',2.2), ('c',5.5), ('a',1.0), ('b',2.3), ('c',2.3), ('a',1.1), ('b',7.2), ('c',2.8)] + + + expected_output: STRUCT(1, 2, 3) \ No newline at end of file From 3321b42dbf7b431941b7fe44a4a26c43818ef165 Mon Sep 17 00:00:00 2001 From: Ross Thomson Date: Thu, 6 Aug 2020 15:58:04 +0000 Subject: [PATCH 003/104] Fixed KW test cases. --- udfs/statslib/test_cases.yaml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/udfs/statslib/test_cases.yaml b/udfs/statslib/test_cases.yaml index c5d5bb3a4..cfdc793e0 100644 --- a/udfs/statslib/test_cases.yaml +++ b/udfs/statslib/test_cases.yaml @@ -4,7 +4,5 @@ pvalue: expected_output: CAST(0.8607079764250578 AS FLOAT64) kruskal_wallis: - test: - input: [('a',1.0), ('b',2.0), ('c',2.3), ('a',1.4), ('b',2.2), ('c',5.5), ('a',1.0), ('b',2.3), ('c',2.3), ('a',1.1), ('b',7.2), ('c',2.8)] - - - expected_output: STRUCT(1, 2, 3) \ No newline at end of file + input: (SELECT [('a',1.0), ('b',2.0), ('c',2.3), ('a',1.4), ('b',2.2), ('c',5.5), ('a',1.0), ('b',2.3), ('c',2.3), ('a',1.1), ('b',7.2), ('c',2.8)]) + expected_output: STRUCT(CAST(3.423076923076927 AS FLOAT64) AS H, CAST( 0.1805877514841956 AS FLOAT64) AS p, CAST(2 AS INT64) AS DoF) From 757d08bce888afe57369de54d615a0f8d8f2c3f2 Mon Sep 17 00:00:00 2001 From: Ross Thomson Date: Tue, 1 Sep 2020 16:38:14 +0000 Subject: [PATCH 004/104] Fixed kw. 
--- udfs/statslib/kruskal_wallis.sql | 2 +- udfs/statslib/linear_regression.sql | 42 +++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 1 deletion(-) create mode 100644 udfs/statslib/linear_regression.sql diff --git a/udfs/statslib/kruskal_wallis.sql b/udfs/statslib/kruskal_wallis.sql index 1a405ef7a..f8fa11251 100644 --- a/udfs/statslib/kruskal_wallis.sql +++ b/udfs/statslib/kruskal_wallis.sql @@ -24,7 +24,7 @@ CREATE OR REPLACE FUNCTION st.kruskal_wallis(data ARRAY>) AS ( + ( + with PRELIM AS( + SELECT SUM(X) AS Sx, SUM(Y) AS Sy, + SUM(X * X) AS Sxx, + SUM(X * Y) AS Sxy, + SUM(Y * Y) AS Syy, + COUNT(*) AS N + FROM ( + SELECT Los AS x, Charge AS Y FROM `Inpatient Confinement` + ) + ) + + +SELECT + ((Sy * Sxx) - (Sx * Sxy) / ((N * (Sxx)) - (Sx * Sx)) AS a, + ((N * Sxy) - (Sx * Sy)) / ((N * Sxx) - (Sx * Sx)) AS b, + ((N * Sxy) - (Sx * Sy)) + / SQRT( + (((N * Sxx) - (Sx * Sx)) + * ((N * Syy - (Sy * Sy))))) AS r + FROM + ( + + ) sums; \ No newline at end of file From 2625f1546d4c1f48b854e1bfdc6cbd1f79353742 Mon Sep 17 00:00:00 2001 From: Ross Thomson Date: Wed, 16 Sep 2020 19:06:09 +0000 Subject: [PATCH 005/104] Added Linear Regression --- .init | 4 +-- udfs/statslib/linear_regression.sql | 49 +++++++++++++++-------------- udfs/statslib/test_cases.yaml | 4 +++ udfs/tests/run.sh | 1 - 4 files changed, 31 insertions(+), 27 deletions(-) diff --git a/.init b/.init index 4910177dd..c6dff5d48 100644 --- a/.init +++ b/.init @@ -5,8 +5,8 @@ if [ -f ~/.bigquery/bin/activate ]; then . 
~/.bigquery/bin/activate fi -if [ -f $(pwd)/key.json ]; then - export GOOGLE_APPLICATION_CREDENTIALS="$(pwd)/key.json" +if [ -f key.json ]; then + export GOOGLE_APPLICATION_CREDENTIALS="key.json" fi gcloud config set project nih-bq diff --git a/udfs/statslib/linear_regression.sql b/udfs/statslib/linear_regression.sql index c15aa145a..ca2264580 100644 --- a/udfs/statslib/linear_regression.sql +++ b/udfs/statslib/linear_regression.sql @@ -15,28 +15,29 @@ */ -CREATE OR REPLACE FUNCTION st.linear_regression(data ARRAY>) AS ( - ( - with PRELIM AS( - SELECT SUM(X) AS Sx, SUM(Y) AS Sy, - SUM(X * X) AS Sxx, - SUM(X * Y) AS Sxy, - SUM(Y * Y) AS Syy, - COUNT(*) AS N - FROM ( - SELECT Los AS x, Charge AS Y FROM `Inpatient Confinement` - ) - ) - -SELECT - ((Sy * Sxx) - (Sx * Sxy) / ((N * (Sxx)) - (Sx * Sx)) AS a, - ((N * Sxy) - (Sx * Sy)) / ((N * Sxx) - (Sx * Sx)) AS b, - ((N * Sxy) - (Sx * Sy)) - / SQRT( - (((N * Sxx) - (Sx * Sx)) - * ((N * Syy - (Sy * Sy))))) AS r - FROM - ( - - ) sums; \ No newline at end of file +CREATE OR REPLACE FUNCTION st.linear_regression(data ARRAY>) +AS (( + WITH results AS ( + WITH sums AS ( + with d as ( + select * from unnest(data) + ) + select + SUM(d.X) as Sx, + SUM(d.Y) as Sy, + SUM(d.X * d.Y) as Sxy, + SUM(d.X * d.X) as Sxx, + SUM(d.Y * d.Y) as Syy, + COUNT(*) as N + from d + ) + SELECT + ((Sy * Sxx) - (Sx * Sxy)) / ((N * (Sxx)) - (Sx * Sx)) AS a, + ((N * Sxy) - (Sx * Sy)) / ((N * Sxx) - (Sx * Sx)) AS b, + ((N * Sxy) - (Sx * Sy))/ SQRT( + (((N * Sxx) - (Sx * Sx))* ((N * Syy - (Sy * Sy))))) AS r + from sums + ) + select STRUCT(a, b, r) from results +)); diff --git a/udfs/statslib/test_cases.yaml b/udfs/statslib/test_cases.yaml index cfdc793e0..f1b356630 100644 --- a/udfs/statslib/test_cases.yaml +++ b/udfs/statslib/test_cases.yaml @@ -6,3 +6,7 @@ kruskal_wallis: - test: input: (SELECT [('a',1.0), ('b',2.0), ('c',2.3), ('a',1.4), ('b',2.2), ('c',5.5), ('a',1.0), ('b',2.3), ('c',2.3), ('a',1.1), ('b',7.2), ('c',2.8)]) expected_output: 
STRUCT(CAST(3.423076923076927 AS FLOAT64) AS H, CAST( 0.1805877514841956 AS FLOAT64) AS p, CAST(2 AS INT64) AS DoF) +linear_regression: + - test: + input: (SELECT [ (5.1,2.5), (5.0,2.0), (5.7,2.6), (6.0,2.2), (5.8,2.6), (5.5,2.3), (6.1,2.8), (5.5,2.5), (6.4,3.2), (5.6,3.0)]) + expected_output: STRUCT(CAST(-0.4353361094588436 AS FLOAT64) AS a, CAST( 0.5300416418798544 AS FLOAT64) AS b, CAST(0.632366563565354 AS FLOAT64) AS r) \ No newline at end of file diff --git a/udfs/tests/run.sh b/udfs/tests/run.sh index e64dabd52..785a45fb5 100755 --- a/udfs/tests/run.sh +++ b/udfs/tests/run.sh @@ -16,7 +16,6 @@ if [[ $1 == "--pip_install_before_run" ]]; then python3 -m pip install -r udfs/tests/requirements.txt - python3 udfs/tests/udf_test_utils.py --create_test_datasets python3 -m pytest --workers 100 udfs/tests/create_udf_signatures.py python3 -m pytest --workers 100 udfs/tests/test_create_udfs.py python3 -m pytest --workers 100 udfs/tests/test_run_udfs.py From aa18c1375083cddb44230c368a504e8eba93c0dc Mon Sep 17 00:00:00 2001 From: Ian Mathews Date: Tue, 2 Feb 2021 14:58:26 -0800 Subject: [PATCH 006/104] Use jstat from bq_js_libs --- udfs/statslib/pvalue.sql | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/udfs/statslib/pvalue.sql b/udfs/statslib/pvalue.sql index d90d1f725..6d0ac1d9b 100644 --- a/udfs/statslib/pvalue.sql +++ b/udfs/statslib/pvalue.sql @@ -1,18 +1,9 @@ #standardSQL -/* -## clone git repot -> git clone https://github.com/jstat/jstat.git - -## sync jstat to GCS as -> gsutil rsync -r ./jstat gs://bq-stats-test-jslib/jstat - -*/ - CREATE OR REPLACE FUNCTION st.pvalue(H FLOAT64, dof INT64) RETURNS FLOAT64 LANGUAGE js AS """ return 1.0 - jStat['chisquare'].cdf(H, dof) """ OPTIONS ( - library="gs://isb_nih/jstat/dist/jstat.js"); \ No newline at end of file + library="library=["gs://bqutil-lib/bq_js_libs/jstat-v1.9.4.min.js"]"); \ No newline at end of file From 9a63ff29bb4aa9d88dd950a0eacc40c1ee570408 Mon Sep 17 00:00:00 2001 From: Ian 
Mathews Date: Tue, 2 Feb 2021 15:52:05 -0800 Subject: [PATCH 007/104] fix typo --- udfs/statslib/pvalue.sql | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/udfs/statslib/pvalue.sql b/udfs/statslib/pvalue.sql index 6d0ac1d9b..997198b2b 100644 --- a/udfs/statslib/pvalue.sql +++ b/udfs/statslib/pvalue.sql @@ -6,4 +6,5 @@ LANGUAGE js AS """ return 1.0 - jStat['chisquare'].cdf(H, dof) """ OPTIONS ( - library="library=["gs://bqutil-lib/bq_js_libs/jstat-v1.9.4.min.js"]"); \ No newline at end of file + library=["gs://bqutil-lib/bq_js_libs/jstat-v1.9.4.min.js"] +); \ No newline at end of file From 65279b57454d7fd3ebf880d8b85f7e3734cce86d Mon Sep 17 00:00:00 2001 From: J Ross Thomson <39315853+jrossthomson@users.noreply.github.com> Date: Mon, 8 Feb 2021 08:59:55 -0500 Subject: [PATCH 008/104] Update kruskal_wallis.sql --- udfs/statslib/kruskal_wallis.sql | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/udfs/statslib/kruskal_wallis.sql b/udfs/statslib/kruskal_wallis.sql index f8fa11251..af7fbe2d8 100644 --- a/udfs/statslib/kruskal_wallis.sql +++ b/udfs/statslib/kruskal_wallis.sql @@ -15,22 +15,22 @@ */ CREATE OR REPLACE FUNCTION st.kruskal_wallis(data ARRAY>) AS (( - with H_raw AS( - with sums AS + WITH H_raw AS( + WITH sums AS ( - with rank_data AS + WITH rank_data AS ( - select d.factor AS f, d.val AS v, rank() over(order by d.val) AS r - from unnest(data) AS d + SELECT d.factor AS f, d.val AS v, RANK() OVER(ORDER BY d.val) AS r + FROM UNNEST(data) AS d ) #rank_data - select + SELECT SUM(r) * (SUM(r) / COUNT(*)) AS sumranks, COUNT(*) AS n - from rank_data + FROM rank_data GROUP BY f ) # sums SELECT 12.00 /(SUM(n) *(SUM(n) + 1)) * SUM(sumranks) -(3.00 *(SUM(n) + 1)) AS H, - count(n) -1 AS DoF + COUNT(n) -1 AS DoF FROM sums ) # H_raw - SELECT struct(H AS H, st.pvalue(H, DoF) AS p, DoF AS DoF) from H_raw -)); \ No newline at end of file + SELECT struct(H AS H, st.pvalue(H, DoF) AS p, DoF AS DoF) FROM H_raw +)); From 
8cc1879680216000b9c22a7d1d4203c984b66bbb Mon Sep 17 00:00:00 2001 From: J Ross Thomson <39315853+jrossthomson@users.noreply.github.com> Date: Mon, 8 Feb 2021 09:01:24 -0500 Subject: [PATCH 009/104] Update linear_regression.sql --- udfs/statslib/linear_regression.sql | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/udfs/statslib/linear_regression.sql b/udfs/statslib/linear_regression.sql index ca2264580..f1b69e1f8 100644 --- a/udfs/statslib/linear_regression.sql +++ b/udfs/statslib/linear_regression.sql @@ -20,24 +20,24 @@ CREATE OR REPLACE FUNCTION st.linear_regression(data ARRAY Date: Mon, 8 Feb 2021 16:21:32 -0500 Subject: [PATCH 010/104] Update udfs/statslib/pvalue.sql Co-authored-by: Daniel De Leo --- udfs/statslib/pvalue.sql | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/udfs/statslib/pvalue.sql b/udfs/statslib/pvalue.sql index 997198b2b..00b8356e0 100644 --- a/udfs/statslib/pvalue.sql +++ b/udfs/statslib/pvalue.sql @@ -6,5 +6,5 @@ LANGUAGE js AS """ return 1.0 - jStat['chisquare'].cdf(H, dof) """ OPTIONS ( - library=["gs://bqutil-lib/bq_js_libs/jstat-v1.9.4.min.js"] -); \ No newline at end of file + library=["${JS_BUCKET}/jstat-v1.9.4.min.js"] +); From 04acaa40108987f685b0e642d68de4cf78cdf8a2 Mon Sep 17 00:00:00 2001 From: J Ross Thomson <39315853+jrossthomson@users.noreply.github.com> Date: Mon, 8 Feb 2021 16:24:15 -0500 Subject: [PATCH 011/104] Delete .init --- .init | 15 --------------- 1 file changed, 15 deletions(-) delete mode 100644 .init diff --git a/.init b/.init deleted file mode 100644 index c6dff5d48..000000000 --- a/.init +++ /dev/null @@ -1,15 +0,0 @@ - -# if lsst virutalenv exists, use it. - -if [ -f ~/.bigquery/bin/activate ]; then - . 
~/.bigquery/bin/activate -fi - -if [ -f key.json ]; then - export GOOGLE_APPLICATION_CREDENTIALS="key.json" -fi - -gcloud config set project nih-bq -gcloud config set compute/zone us-central1-f -gcloud config set compute/region us-central1 - From 568cb96d84cf0e323e9dab18e9bbe6d8e013eaa6 Mon Sep 17 00:00:00 2001 From: J Ross Thomson <39315853+jrossthomson@users.noreply.github.com> Date: Mon, 8 Feb 2021 16:25:09 -0500 Subject: [PATCH 012/104] Update .gitignore --- .gitignore | 3 --- 1 file changed, 3 deletions(-) diff --git a/.gitignore b/.gitignore index 3ca0c2bf1..2e4597c8e 100644 --- a/.gitignore +++ b/.gitignore @@ -28,6 +28,3 @@ hs_err_pid* .vscode .idea/ target/ -tools/unsupervised_dataset/sql_crawler/__pycache__/__init__.cpython-38.pyc -key.json -BigQueryUtils.code-workspace From 9570dcbec9a46fac30fa83426a807115b4318f1b Mon Sep 17 00:00:00 2001 From: J Ross Thomson <39315853+jrossthomson@users.noreply.github.com> Date: Tue, 9 Feb 2021 09:40:26 -0500 Subject: [PATCH 013/104] Update udfs/statslib/kruskal_wallis.sql Co-authored-by: Daniel De Leo --- udfs/statslib/kruskal_wallis.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udfs/statslib/kruskal_wallis.sql b/udfs/statslib/kruskal_wallis.sql index af7fbe2d8..cef316687 100644 --- a/udfs/statslib/kruskal_wallis.sql +++ b/udfs/statslib/kruskal_wallis.sql @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -CREATE OR REPLACE FUNCTION st.kruskal_wallis(data ARRAY>) AS (( +CREATE OR REPLACE FUNCTION fn.kruskal_wallis(data ARRAY>) AS (( WITH H_raw AS( WITH sums AS ( From 14cad124ca885c5419866958b758d9ea96eec608 Mon Sep 17 00:00:00 2001 From: J Ross Thomson <39315853+jrossthomson@users.noreply.github.com> Date: Tue, 9 Feb 2021 09:40:33 -0500 Subject: [PATCH 014/104] Update udfs/statslib/pvalue.sql Co-authored-by: Daniel De Leo --- udfs/statslib/pvalue.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udfs/statslib/pvalue.sql b/udfs/statslib/pvalue.sql index 00b8356e0..0d73ba2a1 100644 --- a/udfs/statslib/pvalue.sql +++ b/udfs/statslib/pvalue.sql @@ -1,6 +1,6 @@ #standardSQL -CREATE OR REPLACE FUNCTION st.pvalue(H FLOAT64, dof INT64) +CREATE OR REPLACE FUNCTION fn.pvalue(H FLOAT64, dof INT64) RETURNS FLOAT64 LANGUAGE js AS """ return 1.0 - jStat['chisquare'].cdf(H, dof) From 2470574dca4ccd2d2d59e11818a6fde5330b652a Mon Sep 17 00:00:00 2001 From: J Ross Thomson <39315853+jrossthomson@users.noreply.github.com> Date: Tue, 9 Feb 2021 09:40:43 -0500 Subject: [PATCH 015/104] Update udfs/statslib/linear_regression.sql Co-authored-by: Daniel De Leo --- udfs/statslib/linear_regression.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udfs/statslib/linear_regression.sql b/udfs/statslib/linear_regression.sql index f1b69e1f8..3b5cda234 100644 --- a/udfs/statslib/linear_regression.sql +++ b/udfs/statslib/linear_regression.sql @@ -16,7 +16,7 @@ -CREATE OR REPLACE FUNCTION st.linear_regression(data ARRAY>) +CREATE OR REPLACE FUNCTION fn.linear_regression(data ARRAY>) AS (( WITH results AS ( WITH sums AS ( From 076d9a3c77f6407658cfac7facf07b235ab65ac3 Mon Sep 17 00:00:00 2001 From: J Ross Thomson <39315853+jrossthomson@users.noreply.github.com> Date: Tue, 9 Feb 2021 14:49:51 -0500 Subject: [PATCH 016/104] Update udfs/statslib/kruskal_wallis.sql Co-authored-by: Daniel De Leo --- udfs/statslib/kruskal_wallis.sql | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/udfs/statslib/kruskal_wallis.sql b/udfs/statslib/kruskal_wallis.sql index cef316687..22d55f6cc 100644 --- a/udfs/statslib/kruskal_wallis.sql +++ b/udfs/statslib/kruskal_wallis.sql @@ -32,5 +32,5 @@ CREATE OR REPLACE FUNCTION fn.kruskal_wallis(data ARRAY Date: Tue, 9 Feb 2021 15:18:30 -0500 Subject: [PATCH 017/104] Update udf_test_utils.py --- udfs/tests/udf_test_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/udfs/tests/udf_test_utils.py b/udfs/tests/udf_test_utils.py index b099f4097..d2bae32c4 100644 --- a/udfs/tests/udf_test_utils.py +++ b/udfs/tests/udf_test_utils.py @@ -25,6 +25,7 @@ from google.cloud import bigquery + DATASET_SUFFIX = f'_test_{os.getenv("SHORT_SHA")}' # Some javascript libraries have issues with webpack's auto # minifier and therefore must be placed in the set below From c73814529793b0788284a5b131ba281bbbba4fc7 Mon Sep 17 00:00:00 2001 From: Ross Thomson Date: Tue, 9 Feb 2021 20:23:41 +0000 Subject: [PATCH 018/104] reverted file --- udfs/tests/udf_test_utils.py | 288 ++++++----------------------------- 1 file changed, 47 insertions(+), 241 deletions(-) diff --git a/udfs/tests/udf_test_utils.py b/udfs/tests/udf_test_utils.py index b099f4097..e2dd86598 100644 --- a/udfs/tests/udf_test_utils.py +++ b/udfs/tests/udf_test_utils.py @@ -18,97 +18,41 @@ import os from pathlib import Path import re -import json from yaml import load from yaml import SafeLoader from google.cloud import bigquery -DATASET_SUFFIX = f'_test_{os.getenv("SHORT_SHA")}' -# Some javascript libraries have issues with webpack's auto -# minifier and therefore must be placed in the set below -# to instruct webpack to not minify them. 
-NO_MINIFY_JS_LIBS = { - 'js-levenshtein', +BIGQUERY_TEST_DATASET_MAPPINGS = { + 'netezza': 'nz_test', + 'oracle': 'or_test', + 'redshift': 'rs_test', + 'snowflake': 'sf_test', + 'teradata': 'td_test', + 'vertica': 've_test', + 'community': 'fn_test', + 'statslib': 'st_test', } - -def get_dir_to_dataset_mappings(): - bq_datasets_yaml_path = Path('./dir_to_dataset_map.yaml') - if bq_datasets_yaml_path.is_file(): - with open(bq_datasets_yaml_path, 'r') as yaml_file: - return load(yaml_file, Loader=SafeLoader) - else: - return None +UDF_PARENT_DIR = 'udfs/' def get_all_udf_paths(): - return glob.glob('./**/*.sql', recursive=True) - - -def get_all_npm_package_config_paths(node_modules_path): - """ - Get all paths to the package.json files for every npm package - specified in the udfs/js_libs/js_libs.yaml file. - - :param node_modules_path: path to the node_modules directory - :return: Set containing paths to package.json files - """ - js_libs_dict = get_js_libs_from_yaml() - js_libs_with_versions = set() - npm_package_config_paths = set() - for lib_name in js_libs_dict: - for version in js_libs_dict.get(lib_name).get('versions'): - js_libs_with_versions.add(f'{lib_name}-v{version}') - for npm_package_config_path in glob.glob( - f'{node_modules_path}/**/package.json'): - npm_package_name = Path(npm_package_config_path).parent.name - if npm_package_name in js_libs_with_versions: - npm_package_config_paths.add(Path(npm_package_config_path)) - return npm_package_config_paths + return glob.glob(UDF_PARENT_DIR + '/**/*.sql', recursive=True) -def load_test_cases(udf_path, udf_name): - """ - For a given path to a UDF, return any test cases for that - UDF. 
- - :param udf_path: Path to a .sql file containing UDF DDL - :param udf_name: Name of the UDF - :return: Iterable containing all test cases for the UDF - """ - udf_parent_dir = Path(udf_path).parent - yaml_test_data_path = udf_parent_dir / 'test_cases.yaml' +def load_test_cases(udf_path): + udf_dir = Path(udf_path).parent + yaml_test_data_path = udf_dir / 'test_cases.yaml' if yaml_test_data_path.is_file(): with open(yaml_test_data_path, 'r') as yaml_file: - return load(yaml_file, Loader=SafeLoader).get(udf_name) - else: - return None - - -def get_js_libs_from_yaml(): - """ - Get all npm package names from the /udfs/js_libs/js_libs.yaml - file. - - :return: dict representation of the js_libs.yaml file - """ - js_libs_yaml_path = Path('./js_libs/js_libs.yaml') - if js_libs_yaml_path.is_file(): - with open(js_libs_yaml_path, 'r') as yaml_file: return load(yaml_file, Loader=SafeLoader) else: return None def extract_udf_name(udf_path): - """ - Parse out the name of the UDF from the SQL DDL - - :param udf_path: Path to the UDF DDL .sql file - :return: Name of the UDF - """ with open(udf_path) as udf_file: udf_sql = udf_file.read() udf_sql = udf_sql.replace('\n', ' ') @@ -121,204 +65,66 @@ def extract_udf_name(udf_path): return None -def replace_with_null_body(udf_path): - """ - For a given path to a UDF DDL file, parse the SQL and return - the UDF with the body entirely replaced with NULL. 
- - :param udf_path: Path to the UDF DDL .sql file - :return: Input UDF DDL with a NULL body - """ +def extract_udf_signature(udf_path): with open(udf_path) as udf_file: udf_sql = udf_file.read() udf_sql = udf_sql.replace('\n', ' ') pattern = re.compile(r'FUNCTION\s+(`?.+?`?.*?\).*?\s+)AS') match = pattern.search(udf_sql) if match: - udf_signature = match[1].replace('LANGUAGE js', '') - udf_null_body = (f'CREATE FUNCTION IF NOT EXISTS {udf_signature}' - f' AS (NULL)') - return udf_null_body + udf_name = match[1].replace('LANGUAGE js', '') + return udf_name else: - return None + return udf_path -def replace_with_test_datasets(project_id, udf_sql): - """ - For a given path to a UDF DDL file, parse the SQL and return the UDF - with the dataset name changed with an added suffix for testing. The suffix - value is defined in the global variable, DATASET_SUFFIX, and includes the - commit's SHORT_SHA to prevent different commits from interfering - with UDF builds. - - :param project_id: Project in which to create the UDF - :param udf_sql: SQL DDL of the UDF - :return: Same SQL DDL as input udf_sql but replaced with testing dataset - """ - udf_sql_with_test_dataset = re.sub( +def replace_with_test_datasets(udf_path=None, project_id=None, udf_sql=None): + if udf_path: + with open(udf_path) as udf_file: + udf_sql = udf_file.read() + udf_length_before_replacement = len(udf_sql) + print("project_id: " + project_id) + print("udf_sql: " + udf_sql) + udf_sql = re.sub( r'(\w+\.)?(?P\w+)(?P\.\w+)\(', - f'`{project_id}.\\g{DATASET_SUFFIX}\\g`(', + f'`{project_id}.\\g_test_{os.getenv("SHORT_SHA")}\\g`(', udf_sql) - if udf_sql_with_test_dataset == udf_sql: - # Replacement failed, return None to prevent overwriting prod dataset + print("udf_sql: " + udf_sql) + if udf_length_before_replacement == len(udf_sql): return None else: - return udf_sql_with_test_dataset + return udf_sql -def get_test_bq_dataset(udf_path): - """ - For a given path to a UDF DDL file, return the BigQuery dataset 
name in - which to test the UDF. The test dataset name is the same as production - but with the suffix, _test_$SHORT_SHA, appended. The $SHORT_SHA value comes - from the commit which triggered the build. - :param udf_path: Path to the UDF DDL .sql file - :return: Name of test dataset - """ - udf_parent_dir_name = Path(udf_path).parent.name - if os.getenv('SHORT_SHA') is not None: - bq_dataset = get_dir_to_dataset_mappings().get(udf_parent_dir_name) - return f'{bq_dataset}{DATASET_SUFFIX}' - else: - return None +def get_target_bq_dataset(udf_path): + parent_dir_name = Path(udf_path).parent.name + return f'{BIGQUERY_TEST_DATASET_MAPPINGS.get(parent_dir_name)}_{os.getenv("SHORT_SHA")}' def delete_datasets(client): - for dataset in get_dir_to_dataset_mappings().values(): - dataset = f'{dataset}{DATASET_SUFFIX}' + for dataset in BIGQUERY_TEST_DATASET_MAPPINGS.values(): + dataset = f'{dataset}_{os.getenv("SHORT_SHA")}' client.delete_dataset(dataset, delete_contents=True, not_found_ok=True) -def create_datasets(client, dataset_suffix=None): - for dataset in get_dir_to_dataset_mappings().values(): - if dataset_suffix: - dataset = f'{dataset}{dataset_suffix}' +def create_datasets(client): + for dataset in BIGQUERY_TEST_DATASET_MAPPINGS.values(): + dataset = f'{dataset}_{os.getenv("SHORT_SHA")}' client.create_dataset(dataset, exists_ok=True) -def generate_js_libs_package_json(): - """ - This dynamically generates the main package.json which will be used to build - all the js libs that are specified in the udfs/js_libs/js_libs.yaml file. 
- """ - js_libs_dict = get_js_libs_from_yaml() - js_libs_package_dict = { - "name": "js-bq-libs", - "version": "1.0.0", - "scripts": { - "build-all-libs": "concurrently \"npm:webpack-*\"" - }, - "dependencies": { - f'{lib_name}-v{version}': f'npm:{lib_name}@^{version}' - for lib_name in js_libs_dict - for version in js_libs_dict.get(lib_name).get('versions') - }, - "devDependencies": { - "webpack": "^5.3.1", - "webpack-cli": "^4.1.0", - "concurrently": "^5.3.0" - } - } - # Update with webpack scripts for building all js packages - for lib_name in js_libs_dict: - for version in js_libs_dict.get(lib_name).get('versions'): - js_libs_package_dict.get('scripts').update({ - f'webpack-{lib_name}-v{version}': f'webpack --config {lib_name}-v{version}-webpack.config.js' - }) - - with open('./package.json', 'w') as js_libs_package_json: - js_libs_package_json.write(json.dumps(js_libs_package_dict, indent=2)) - - -def generate_webpack_configs(): - """ - This dynamically generates all the webpack config files needed - to build the single-file js libs which are specified in the - udfs/js_libs/js_libs.yaml file. - See https://webpack.js.org/concepts/configuration/ for more information - on webpack config files. 
- """ - node_modules_path = Path('./node_modules') - npm_package_config_paths = get_all_npm_package_config_paths( - node_modules_path) - for npm_package_config_path in npm_package_config_paths: - with open(npm_package_config_path) as npm_package_config: - npm_package_json = json.loads(npm_package_config.read()) - # Check for js main entrypoint - # https://docs.npmjs.com/cli/v6/configuring-npm/package-json#main - # If no main entrypoint found, check for a single dependency file - # https://docs.npmjs.com/cli/v6/configuring-npm/package-json#files - js_main_entrypoint = npm_package_json.get('main') - js_dependency_files = npm_package_json.get('files') - js_lib_name = npm_package_json.get('name') - js_lib_version = npm_package_json.get('version') - if js_main_entrypoint is not None: - js_main_entrypoint_path = npm_package_config_path.parent / Path( - js_main_entrypoint) - elif len(js_dependency_files) == 1: - js_main_entrypoint_path = npm_package_config_path.parent / Path( - js_dependency_files[0]) - webpack_config_file_path = Path( - f'{npm_package_config_path.parent.name}-webpack.config.js') - minimize_js = True if js_lib_name not in NO_MINIFY_JS_LIBS else False - js_lib_file_extension = ".min.js" if minimize_js else ".js" - with open(webpack_config_file_path, 'w') as webpack_config: - webpack_config.write( - f'var path = require("path");\n' - f'module.exports = {{\n' - f' entry: "./{js_main_entrypoint_path}",\n' - f' output: {{\n' - f' path: path.resolve(__dirname, "js_builds"),\n' - f' filename: "{js_lib_name}-v{js_lib_version}{js_lib_file_extension}",\n' - f' library: "{js_lib_name.replace("-", "_")}",\n' - f' libraryTarget: "var",\n' - f' }},\n' - f' optimization: {{\n' - f' minimize: {"true" if minimize_js else "false"}\n' - f' }},\n' - f' mode: "production",\n' - f'}};') - - def main(): - parser = argparse.ArgumentParser( - description='Utils Class to support testing BigQuery UDFs') - parser.add_argument( - '--create-prod-datasets', - help='Create prod datasets 
used for UDF function testing.', - action='store_true') - parser.add_argument( - '--create-test-datasets', - help='Create test datasets used for UDF function testing.', - action='store_true') - parser.add_argument( - '--delete-test-datasets', - help='Delete test datasets used for UDF function testing.', - action='store_true') - parser.add_argument( - '--generate-js-libs-package-json', - help='Generate package.json file necessary for building ' - 'javascript libs for BigQuery UDFs', - action='store_true') - parser.add_argument( - '--generate-webpack-configs', - help='Generate webpack config files necessary for building ' - 'javascript libs for BigQuery UDFs', - action='store_true') + parser = argparse.ArgumentParser(description='Utils Class to support testing BigQuery UDFs') + parser.add_argument('--create_test_datasets', help='Create test datasets used for UDF function testing.', + action='store_true') + parser.add_argument('--delete_test_datasets', help='Delete test datasets used for UDF function testing.', + action='store_true') args = parser.parse_args() - - bq_project_id = os.getenv('BQ_PROJECT_ID') - if args.create_prod_datasets: - create_datasets(bigquery.Client(project=bq_project_id)) - elif args.create_test_datasets: - create_datasets(bigquery.Client(project=bq_project_id), - dataset_suffix=DATASET_SUFFIX) + client = bigquery.Client() + if args.create_test_datasets: + create_datasets(client) elif args.delete_test_datasets: - delete_datasets(bigquery.Client(project=bq_project_id)) - elif args.generate_js_libs_package_json: - generate_js_libs_package_json() - elif args.generate_webpack_configs: - generate_webpack_configs() + delete_datasets(client) if __name__ == '__main__': From 8457323c9411caab3b6a2f82f1a10bdf8eaf5933 Mon Sep 17 00:00:00 2001 From: J Ross Thomson <39315853+jrossthomson@users.noreply.github.com> Date: Tue, 9 Feb 2021 16:12:11 -0500 Subject: [PATCH 019/104] Delete key.json --- key.json | 12 ------------ 1 file changed, 12 deletions(-) delete 
mode 100644 key.json diff --git a/key.json b/key.json deleted file mode 100644 index 15a569d65..000000000 --- a/key.json +++ /dev/null @@ -1,12 +0,0 @@ -{ - "type": "service_account", - "project_id": "nih-bq", - "private_key_id": "350b8626ac6bff2e3c0b580352a394d39a044ad0", - "private_key": "-----BEGIN PRIVATE KEY-----\nMIIEugIBADANBgkqhkiG9w0BAQEFAASCBKQwggSgAgEAAoIBAQDFXCD+OQdK5jUK\nEqp4Te1+fmuwSYPUZAreBZtNqEFsNv7/TLuLbqAPNUxcP/1RlpSQ19aOc6cjIUPn\n09R+lkWbhdlbE5K+WsL5LbOJqzXbGDDjQJE8Vpta6Qz41Y0GQOX1iIItTcz/db6O\nGDQ1YHKHOZoGR+6V0J5y3vpF96IeFNUn4JMB8po5WWBhDwmNJOSVpTcZuQ0AisYk\nNWBvPNK2eNZGpCvnYo+sxXCOqgpYY7LuSw3e7OOCIyyadyn1ouGyiRXqb/UR9vGN\noDYJyyff5f3qE9uMM7lfUQ7j0Fg2ZOf52aBlciCE+hNEDiIE1IenWOjJ9cl+/fsR\nT7D+4VUrAgMBAAECgf9C0EMFzZ3/vUTiJBo8oCUcAwXHUxv4yzrima358abe+79Y\nA/d5507zLG1YIvxJ+AWpA2j00N0ZsilK/RZIb0HMD4Qwpm+yAbKqqM4fIoHDqmzi\ngk6oyVPOyGxGfAVCGLZrvnWo4K3bI1a0xCeTP74ws/obe5P0eVaAN6ctXf2EMPCL\ndGztLWkKrbqSLN44pyMUqbhFrAivbg+ErlPH1N1IAZEvAfHB0xEtUcZvL0x5F962\nPvgFD2cQspnFxhkEL89mPD8Pgf1j7+Ntwp81x1lgRh0F0HW9lGTk9KlOP5TY+Ykc\n7p1mmn1SQReMX0MwRD7BAVCAcuHB3z5vb1LT2yECgYEA9QulrA25lfiEnP5ZrFty\nR7oGcz0MYTU9Y6/PhR7hqQbij5pmtBrf2Xgr71dsQxMmoydxIVsirFR+hRbN6hAd\nWYIuCUy0q1SHbCw/pHRqxpS1yJGXQkzp5F9T3s6xDJH0uqprKRbDEoTo5NjRigjI\n/OBvvjAwBAcmoCsVDGFf3NUCgYEAzi7B4eOsNNY620N78a8soZVRkdaYExunvDUp\nREn6vA96K4cYqL1whfRxRt3CgWOWohNPDTTvXjFJFl8b2SG6OqRn6UfP0O7h5cbY\nisdkos5FaKd4OBnBjq07i91ED0wmLwUwQSejkDER7gRzTnOXD6O1w7AiLTqJkSFF\nM5gNaf8CgYB2kZctZEq7bU/FozECvBDU8gcy9YQq+RbmSGiSUjoAjotLzvsJBMPx\ntzsTCWsiHX8hjPEpoa5qUkwR29HxArEg4WOjLlFxYWpRhiuZSJdt1QEMtp3I/+xg\ni1vKwJpcqbcQJ2vIqvHi4RrLuLnTH6MY8QRAs5JYMJmZAJ+jqO44rQKBgGLJNEuu\nccu++OQlKTVa4N9qU3K0hphhH7SlI+h8KY4BPF+ukroGSetkewGhLpgfxwRQjAef\n3TAcHRCC8SzPZBLEwqSaq49HZSRaaVXUwvreKZNC6e63KG3bwjP5B6rsqT7OFZii\nDVQArr6Dk1t7Nl1f6q6frUBOU6sdbbKSn5qxAoGAHjvIdGMIv83RQZjEi9v9n30G\nkUZoazHXFPKQyGdK2g9GkALm2wIOvHAGsa+0pXCx9QbxbBElhUCs0eXQBpFC+r8e\nKeuM8TihWGvu9AXEKhzEHwm0zICJc8wk1W/+bRB/O6yZPZ55Q1iydhxGyLf644G1\nJsbbWq4EnmRwqzPUdOM=\n-----EN
D PRIVATE KEY-----\n", - "client_email": "nih-bq@nih-bq.iam.gserviceaccount.com", - "client_id": "108143622486451250165", - "auth_uri": "https://accounts.google.com/o/oauth2/auth", - "token_uri": "https://oauth2.googleapis.com/token", - "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs", - "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/nih-bq%40nih-bq.iam.gserviceaccount.com" -} From bbc53a8c7f858565ac1d3ba8bbb711229a8e3d5a Mon Sep 17 00:00:00 2001 From: Ross Thomson Date: Tue, 9 Feb 2021 21:31:54 +0000 Subject: [PATCH 020/104] Moved statslib to community --- udfs/community/README.md | 58 +++++++++++++++++++ .../kruskal_wallis.sql | 0 .../linear_regression.sql | 0 udfs/{statslib => community}/pvalue.sql | 0 udfs/community/test_cases.yaml | 15 +++++ udfs/statslib/README.md | 53 ----------------- udfs/statslib/test_cases.yaml | 12 ---- 7 files changed, 73 insertions(+), 65 deletions(-) rename udfs/{statslib => community}/kruskal_wallis.sql (100%) rename udfs/{statslib => community}/linear_regression.sql (100%) rename udfs/{statslib => community}/pvalue.sql (100%) delete mode 100644 udfs/statslib/README.md delete mode 100644 udfs/statslib/test_cases.yaml diff --git a/udfs/community/README.md b/udfs/community/README.md index 7efac7c59..420f51dc6 100644 --- a/udfs/community/README.md +++ b/udfs/community/README.md @@ -586,3 +586,61 @@ returns: | 4 | 40 | 6.324555320336759 | | 5 | 50 | 12.649110640673518 | + +
+
+
+ +# StatsLib: Statistical UDFs + +This directory contains community contributed [user-defined functions](https://cloud.google.com/bigquery/docs/reference/standard-sql/user-defined-functions) for Statistical Analysis +to extend BigQuery for more specialized usage patterns. Each UDF within this +directory will be automatically synchronized to the `bqutil` project within the +`fn` dataset for reference in queries. + +For example, if you'd like to reference the `int` function within your query, +you can reference it like the following: +```sql +SELECT bqutil.fn.int(1.684) +``` + +## UDFs + +* [kruskal_wallis](#kruskal_wallisarrstructfactor-string-val-float64) + +## Documentation + +### [kruskal_wallis(arr(struct(factor STRING, val FLOAT64))](kruskal_wallis.sql) +Takes an array of struct where each struct (point) represents a measurement, with a group label and a measurement value + +The [Kruskal–Wallis test by ranks](https://en.wikipedia.org/wiki/Kruskal%E2%80%93Wallis_one-way_analysis_of_variance), Kruskal–Wallis H test (named after William Kruskal and W. Allen Wallis), or one-way ANOVA on ranks is a non-parametric method for testing whether samples originate from the same distribution. It is used for comparing two or more independent samples of equal or different sample sizes. It extends the Mann–Whitney U test, which is used for comparing only two groups. The parametric equivalent of the Kruskal–Wallis test is the one-way analysis of variance (ANOVA). 
+ +* Input: array: struct +* Output: struct +```sql +DECLARE data ARRAY>; + +set data = [ +('a',1.0), +('b',2.0), +('c',2.3), +('a',1.4), +('b',2.2), +('c',5.5), +('a',1.0), +('b',2.3), +('c',2.3), +('a',1.1), +('b',7.2), +('c',2.8) +]; + + +SELECT `lib_stats.kruskal_wallis`(data) as results; +``` + +results: + +| results.H | results.p | results.DoF | +|-----------|-----------|-------------| +| 3.4230769 | 0.1805877 | 2 | \ No newline at end of file diff --git a/udfs/statslib/kruskal_wallis.sql b/udfs/community/kruskal_wallis.sql similarity index 100% rename from udfs/statslib/kruskal_wallis.sql rename to udfs/community/kruskal_wallis.sql diff --git a/udfs/statslib/linear_regression.sql b/udfs/community/linear_regression.sql similarity index 100% rename from udfs/statslib/linear_regression.sql rename to udfs/community/linear_regression.sql diff --git a/udfs/statslib/pvalue.sql b/udfs/community/pvalue.sql similarity index 100% rename from udfs/statslib/pvalue.sql rename to udfs/community/pvalue.sql diff --git a/udfs/community/test_cases.yaml b/udfs/community/test_cases.yaml index 74afd3bff..a41816c9e 100644 --- a/udfs/community/test_cases.yaml +++ b/udfs/community/test_cases.yaml @@ -389,3 +389,18 @@ nlp_compromise_people: - test: input: CAST("Randal Kieth Orton and Dwayne 'the rock' Johnson had a really funny fight." 
AS STRING) expected_output: CAST(['randal kieth orton', 'dwayne the rock johnson'] AS ARRAY) +# +# Below targets StatsLib work +# +pvalue: + - test: + input: CAST(0.3 AS FLOAT64), CAST(2 AS INT64) + expected_output: CAST(0.8607079764250578 AS FLOAT64) +kruskal_wallis: + - test: + input: (SELECT [('a',1.0), ('b',2.0), ('c',2.3), ('a',1.4), ('b',2.2), ('c',5.5), ('a',1.0), ('b',2.3), ('c',2.3), ('a',1.1), ('b',7.2), ('c',2.8)]) + expected_output: STRUCT(CAST(3.423076923076927 AS FLOAT64) AS H, CAST( 0.1805877514841956 AS FLOAT64) AS p, CAST(2 AS INT64) AS DoF) +linear_regression: + - test: + input: (SELECT [ (5.1,2.5), (5.0,2.0), (5.7,2.6), (6.0,2.2), (5.8,2.6), (5.5,2.3), (6.1,2.8), (5.5,2.5), (6.4,3.2), (5.6,3.0)]) + expected_output: STRUCT(CAST(-0.4353361094588436 AS FLOAT64) AS a, CAST( 0.5300416418798544 AS FLOAT64) AS b, CAST(0.632366563565354 AS FLOAT64) AS r) \ No newline at end of file diff --git a/udfs/statslib/README.md b/udfs/statslib/README.md deleted file mode 100644 index b5ab7076a..000000000 --- a/udfs/statslib/README.md +++ /dev/null @@ -1,53 +0,0 @@ -# Statistical UDFs - -This directory contains community contributed [user-defined functions](https://cloud.google.com/bigquery/docs/reference/standard-sql/user-defined-functions) for Statistical Analysis -to extend BigQuery for more specialized usage patterns. Each UDF within this -directory will be automatically synchronized to the `bqutil` project within the -`fn` dataset for reference in queries. 
- -For example, if you'd like to reference the `int` function within your query, -you can reference it like the following: -```sql -SELECT bqutil.fn.int(1.684) -``` - -## UDFs - -* [kruskal_wallis](#kruskal_wallisarrstructfactor-string-val-float64) - -## Documentation - -### [kruskal_wallis(arr(struct(factor STRING, val FLOAT64))](kruskal_wallis.sql) -Takes an array of struct where each struct (point) represents a measurement, with a group label and a measurement value - -The [Kruskal–Wallis test by ranks](https://en.wikipedia.org/wiki/Kruskal%E2%80%93Wallis_one-way_analysis_of_variance), Kruskal–Wallis H test (named after William Kruskal and W. Allen Wallis), or one-way ANOVA on ranks is a non-parametric method for testing whether samples originate from the same distribution. It is used for comparing two or more independent samples of equal or different sample sizes. It extends the Mann–Whitney U test, which is used for comparing only two groups. The parametric equivalent of the Kruskal–Wallis test is the one-way analysis of variance (ANOVA). 
- -* Input: array: struct -* Output: struct -```sql -DECLARE data ARRAY>; - -set data = [ -('a',1.0), -('b',2.0), -('c',2.3), -('a',1.4), -('b',2.2), -('c',5.5), -('a',1.0), -('b',2.3), -('c',2.3), -('a',1.1), -('b',7.2), -('c',2.8) -]; - - -SELECT `lib_stats.kruskal_wallis`(data) as results; -``` - -results: - -| results.H | results.p | results.DoF | -|-----------|-----------|-------------| -| 3.4230769 | 0.1805877 | 2 | \ No newline at end of file diff --git a/udfs/statslib/test_cases.yaml b/udfs/statslib/test_cases.yaml deleted file mode 100644 index f1b356630..000000000 --- a/udfs/statslib/test_cases.yaml +++ /dev/null @@ -1,12 +0,0 @@ -pvalue: - - test: - input: CAST(0.3 AS FLOAT64), CAST(2 AS INT64) - expected_output: CAST(0.8607079764250578 AS FLOAT64) -kruskal_wallis: - - test: - input: (SELECT [('a',1.0), ('b',2.0), ('c',2.3), ('a',1.4), ('b',2.2), ('c',5.5), ('a',1.0), ('b',2.3), ('c',2.3), ('a',1.1), ('b',7.2), ('c',2.8)]) - expected_output: STRUCT(CAST(3.423076923076927 AS FLOAT64) AS H, CAST( 0.1805877514841956 AS FLOAT64) AS p, CAST(2 AS INT64) AS DoF) -linear_regression: - - test: - input: (SELECT [ (5.1,2.5), (5.0,2.0), (5.7,2.6), (6.0,2.2), (5.8,2.6), (5.5,2.3), (6.1,2.8), (5.5,2.5), (6.4,3.2), (5.6,3.0)]) - expected_output: STRUCT(CAST(-0.4353361094588436 AS FLOAT64) AS a, CAST( 0.5300416418798544 AS FLOAT64) AS b, CAST(0.632366563565354 AS FLOAT64) AS r) \ No newline at end of file From f7c6d773205909255171cceca0855984be8e4331 Mon Sep 17 00:00:00 2001 From: Ross Thomson Date: Fri, 12 Feb 2021 21:11:55 +0000 Subject: [PATCH 021/104] trying to revert again --- udfs/tests/udf_test_utils.py | 291 +++++++++++++++++++++++++++++------ 1 file changed, 243 insertions(+), 48 deletions(-) diff --git a/udfs/tests/udf_test_utils.py b/udfs/tests/udf_test_utils.py index e2dd86598..b7f7bd483 100644 --- a/udfs/tests/udf_test_utils.py +++ b/udfs/tests/udf_test_utils.py @@ -18,41 +18,98 @@ import os from pathlib import Path import re +import json from yaml import 
load from yaml import SafeLoader from google.cloud import bigquery -BIGQUERY_TEST_DATASET_MAPPINGS = { - 'netezza': 'nz_test', - 'oracle': 'or_test', - 'redshift': 'rs_test', - 'snowflake': 'sf_test', - 'teradata': 'td_test', - 'vertica': 've_test', - 'community': 'fn_test', - 'statslib': 'st_test', + +DATASET_SUFFIX = f'_test_{os.getenv("SHORT_SHA")}' +# Some javascript libraries have issues with webpack's auto +# minifier and therefore must be placed in the set below +# to instruct webpack to not minify them. +NO_MINIFY_JS_LIBS = { + 'js-levenshtein', } -UDF_PARENT_DIR = 'udfs/' + +def get_dir_to_dataset_mappings(): + bq_datasets_yaml_path = Path('./dir_to_dataset_map.yaml') + if bq_datasets_yaml_path.is_file(): + with open(bq_datasets_yaml_path, 'r') as yaml_file: + return load(yaml_file, Loader=SafeLoader) + else: + return None def get_all_udf_paths(): - return glob.glob(UDF_PARENT_DIR + '/**/*.sql', recursive=True) + return glob.glob('./**/*.sql', recursive=True) + + +def get_all_npm_package_config_paths(node_modules_path): + """ + Get all paths to the package.json files for every npm package + specified in the udfs/js_libs/js_libs.yaml file. 
+ :param node_modules_path: path to the node_modules directory + :return: Set containing paths to package.json files + """ + js_libs_dict = get_js_libs_from_yaml() + js_libs_with_versions = set() + npm_package_config_paths = set() + for lib_name in js_libs_dict: + for version in js_libs_dict.get(lib_name).get('versions'): + js_libs_with_versions.add(f'{lib_name}-v{version}') + for npm_package_config_path in glob.glob( + f'{node_modules_path}/**/package.json'): + npm_package_name = Path(npm_package_config_path).parent.name + if npm_package_name in js_libs_with_versions: + npm_package_config_paths.add(Path(npm_package_config_path)) + return npm_package_config_paths -def load_test_cases(udf_path): - udf_dir = Path(udf_path).parent - yaml_test_data_path = udf_dir / 'test_cases.yaml' + +def load_test_cases(udf_path, udf_name): + """ + For a given path to a UDF, return any test cases for that + UDF. + + :param udf_path: Path to a .sql file containing UDF DDL + :param udf_name: Name of the UDF + :return: Iterable containing all test cases for the UDF + """ + udf_parent_dir = Path(udf_path).parent + yaml_test_data_path = udf_parent_dir / 'test_cases.yaml' if yaml_test_data_path.is_file(): with open(yaml_test_data_path, 'r') as yaml_file: + return load(yaml_file, Loader=SafeLoader).get(udf_name) + else: + return None + + +def get_js_libs_from_yaml(): + """ + Get all npm package names from the /udfs/js_libs/js_libs.yaml + file. 
+ + :return: dict representation of the js_libs.yaml file + """ + js_libs_yaml_path = Path('./js_libs/js_libs.yaml') + if js_libs_yaml_path.is_file(): + with open(js_libs_yaml_path, 'r') as yaml_file: return load(yaml_file, Loader=SafeLoader) else: return None def extract_udf_name(udf_path): + """ + Parse out the name of the UDF from the SQL DDL + + :param udf_path: Path to the UDF DDL .sql file + :return: Name of the UDF + """ with open(udf_path) as udf_file: udf_sql = udf_file.read() udf_sql = udf_sql.replace('\n', ' ') @@ -65,67 +122,205 @@ def extract_udf_name(udf_path): return None -def extract_udf_signature(udf_path): +def replace_with_null_body(udf_path): + """ + For a given path to a UDF DDL file, parse the SQL and return + the UDF with the body entirely replaced with NULL. + + :param udf_path: Path to the UDF DDL .sql file + :return: Input UDF DDL with a NULL body + """ with open(udf_path) as udf_file: udf_sql = udf_file.read() udf_sql = udf_sql.replace('\n', ' ') pattern = re.compile(r'FUNCTION\s+(`?.+?`?.*?\).*?\s+)AS') match = pattern.search(udf_sql) if match: - udf_name = match[1].replace('LANGUAGE js', '') - return udf_name + udf_signature = match[1].replace('LANGUAGE js', '') + udf_null_body = (f'CREATE FUNCTION IF NOT EXISTS {udf_signature}' + f' AS (NULL)') + return udf_null_body else: - return udf_path + return None -def replace_with_test_datasets(udf_path=None, project_id=None, udf_sql=None): - if udf_path: - with open(udf_path) as udf_file: - udf_sql = udf_file.read() - udf_length_before_replacement = len(udf_sql) - print("project_id: " + project_id) - print("udf_sql: " + udf_sql) - udf_sql = re.sub( +def replace_with_test_datasets(project_id, udf_sql): + """ + For a given path to a UDF DDL file, parse the SQL and return the UDF + with the dataset name changed with an added suffix for testing. 
The suffix + value is defined in the global variable, DATASET_SUFFIX, and includes the + commit's SHORT_SHA to prevent different commits from interfering + with UDF builds. + + :param project_id: Project in which to create the UDF + :param udf_sql: SQL DDL of the UDF + :return: Same SQL DDL as input udf_sql but replaced with testing dataset + """ + udf_sql_with_test_dataset = re.sub( r'(\w+\.)?(?P\w+)(?P\.\w+)\(', - f'`{project_id}.\\g_test_{os.getenv("SHORT_SHA")}\\g`(', + f'`{project_id}.\\g{DATASET_SUFFIX}\\g`(', udf_sql) - print("udf_sql: " + udf_sql) - if udf_length_before_replacement == len(udf_sql): + if udf_sql_with_test_dataset == udf_sql: + # Replacement failed, return None to prevent overwriting prod dataset return None else: - return udf_sql + return udf_sql_with_test_dataset -def get_target_bq_dataset(udf_path): - parent_dir_name = Path(udf_path).parent.name - return f'{BIGQUERY_TEST_DATASET_MAPPINGS.get(parent_dir_name)}_{os.getenv("SHORT_SHA")}' +def get_test_bq_dataset(udf_path): + """ + For a given path to a UDF DDL file, return the BigQuery dataset name in + which to test the UDF. The test dataset name is the same as production + but with the suffix, _test_$SHORT_SHA, appended. The $SHORT_SHA value comes + from the commit which triggered the build. 
+ :param udf_path: Path to the UDF DDL .sql file + :return: Name of test dataset + """ + udf_parent_dir_name = Path(udf_path).parent.name + if os.getenv('SHORT_SHA') is not None: + bq_dataset = get_dir_to_dataset_mappings().get(udf_parent_dir_name) + return f'{bq_dataset}{DATASET_SUFFIX}' + else: + return None def delete_datasets(client): - for dataset in BIGQUERY_TEST_DATASET_MAPPINGS.values(): - dataset = f'{dataset}_{os.getenv("SHORT_SHA")}' + for dataset in get_dir_to_dataset_mappings().values(): + dataset = f'{dataset}{DATASET_SUFFIX}' client.delete_dataset(dataset, delete_contents=True, not_found_ok=True) -def create_datasets(client): - for dataset in BIGQUERY_TEST_DATASET_MAPPINGS.values(): - dataset = f'{dataset}_{os.getenv("SHORT_SHA")}' +def create_datasets(client, dataset_suffix=None): + for dataset in get_dir_to_dataset_mappings().values(): + if dataset_suffix: + dataset = f'{dataset}{dataset_suffix}' client.create_dataset(dataset, exists_ok=True) +def generate_js_libs_package_json(): + """ + This dynamically generates the main package.json which will be used to build + all the js libs that are specified in the udfs/js_libs/js_libs.yaml file. 
+ """ + js_libs_dict = get_js_libs_from_yaml() + js_libs_package_dict = { + "name": "js-bq-libs", + "version": "1.0.0", + "scripts": { + "build-all-libs": "concurrently \"npm:webpack-*\"" + }, + "dependencies": { + f'{lib_name}-v{version}': f'npm:{lib_name}@^{version}' + for lib_name in js_libs_dict + for version in js_libs_dict.get(lib_name).get('versions') + }, + "devDependencies": { + "webpack": "^5.3.1", + "webpack-cli": "^4.1.0", + "concurrently": "^5.3.0" + } + } + # Update with webpack scripts for building all js packages + for lib_name in js_libs_dict: + for version in js_libs_dict.get(lib_name).get('versions'): + js_libs_package_dict.get('scripts').update({ + f'webpack-{lib_name}-v{version}': f'webpack --config {lib_name}-v{version}-webpack.config.js' + }) + + with open('./package.json', 'w') as js_libs_package_json: + js_libs_package_json.write(json.dumps(js_libs_package_dict, indent=2)) + + +def generate_webpack_configs(): + """ + This dynamically generates all the webpack config files needed + to build the single-file js libs which are specified in the + udfs/js_libs/js_libs.yaml file. + See https://webpack.js.org/concepts/configuration/ for more information + on webpack config files. 
+ """ + node_modules_path = Path('./node_modules') + npm_package_config_paths = get_all_npm_package_config_paths( + node_modules_path) + for npm_package_config_path in npm_package_config_paths: + with open(npm_package_config_path) as npm_package_config: + npm_package_json = json.loads(npm_package_config.read()) + # Check for js main entrypoint + # https://docs.npmjs.com/cli/v6/configuring-npm/package-json#main + # If no main entrypoint found, check for a single dependency file + # https://docs.npmjs.com/cli/v6/configuring-npm/package-json#files + js_main_entrypoint = npm_package_json.get('main') + js_dependency_files = npm_package_json.get('files') + js_lib_name = npm_package_json.get('name') + js_lib_version = npm_package_json.get('version') + if js_main_entrypoint is not None: + js_main_entrypoint_path = npm_package_config_path.parent / Path( + js_main_entrypoint) + elif len(js_dependency_files) == 1: + js_main_entrypoint_path = npm_package_config_path.parent / Path( + js_dependency_files[0]) + webpack_config_file_path = Path( + f'{npm_package_config_path.parent.name}-webpack.config.js') + minimize_js = True if js_lib_name not in NO_MINIFY_JS_LIBS else False + js_lib_file_extension = ".min.js" if minimize_js else ".js" + with open(webpack_config_file_path, 'w') as webpack_config: + webpack_config.write( + f'var path = require("path");\n' + f'module.exports = {{\n' + f' entry: "./{js_main_entrypoint_path}",\n' + f' output: {{\n' + f' path: path.resolve(__dirname, "js_builds"),\n' + f' filename: "{js_lib_name}-v{js_lib_version}{js_lib_file_extension}",\n' + f' library: "{js_lib_name.replace("-", "_")}",\n' + f' libraryTarget: "var",\n' + f' }},\n' + f' optimization: {{\n' + f' minimize: {"true" if minimize_js else "false"}\n' + f' }},\n' + f' mode: "production",\n' + f'}};') + + def main(): - parser = argparse.ArgumentParser(description='Utils Class to support testing BigQuery UDFs') - parser.add_argument('--create_test_datasets', help='Create test datasets used 
for UDF function testing.', - action='store_true') - parser.add_argument('--delete_test_datasets', help='Delete test datasets used for UDF function testing.', - action='store_true') + parser = argparse.ArgumentParser( + description='Utils Class to support testing BigQuery UDFs') + parser.add_argument( + '--create-prod-datasets', + help='Create prod datasets used for UDF function testing.', + action='store_true') + parser.add_argument( + '--create-test-datasets', + help='Create test datasets used for UDF function testing.', + action='store_true') + parser.add_argument( + '--delete-test-datasets', + help='Delete test datasets used for UDF function testing.', + action='store_true') + parser.add_argument( + '--generate-js-libs-package-json', + help='Generate package.json file necessary for building ' + 'javascript libs for BigQuery UDFs', + action='store_true') + parser.add_argument( + '--generate-webpack-configs', + help='Generate webpack config files necessary for building ' + 'javascript libs for BigQuery UDFs', + action='store_true') args = parser.parse_args() - client = bigquery.Client() - if args.create_test_datasets: - create_datasets(client) + + bq_project_id = os.getenv('BQ_PROJECT_ID') + if args.create_prod_datasets: + create_datasets(bigquery.Client(project=bq_project_id)) + elif args.create_test_datasets: + create_datasets(bigquery.Client(project=bq_project_id), + dataset_suffix=DATASET_SUFFIX) elif args.delete_test_datasets: - delete_datasets(client) + delete_datasets(bigquery.Client(project=bq_project_id)) + elif args.generate_js_libs_package_json: + generate_js_libs_package_json() + elif args.generate_webpack_configs: + generate_webpack_configs() if __name__ == '__main__': - main() + main() \ No newline at end of file From d4cc03912c5bf0607a6a26b15b408cb8f7d36efb Mon Sep 17 00:00:00 2001 From: Ross Thomson Date: Fri, 12 Feb 2021 21:13:26 +0000 Subject: [PATCH 022/104] fixed newline --- udfs/tests/udf_test_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/udfs/tests/udf_test_utils.py b/udfs/tests/udf_test_utils.py index b7f7bd483..d2bae32c4 100644 --- a/udfs/tests/udf_test_utils.py +++ b/udfs/tests/udf_test_utils.py @@ -323,4 +323,4 @@ def main(): if __name__ == '__main__': - main() \ No newline at end of file + main() From 6705fc55974dc26c42d1d3c51a910adb6d07f02f Mon Sep 17 00:00:00 2001 From: J Ross Thomson <39315853+jrossthomson@users.noreply.github.com> Date: Mon, 22 Feb 2021 09:59:13 -0500 Subject: [PATCH 023/104] Delete BigQueryUtils.code-workspace --- BigQueryUtils.code-workspace | 7 ------- 1 file changed, 7 deletions(-) delete mode 100644 BigQueryUtils.code-workspace diff --git a/BigQueryUtils.code-workspace b/BigQueryUtils.code-workspace deleted file mode 100644 index 8aba6a2ab..000000000 --- a/BigQueryUtils.code-workspace +++ /dev/null @@ -1,7 +0,0 @@ -{ - "folders": [ - { - "path": "." - } - ] -} \ No newline at end of file From c668b7fa13cf5eaffea1788eb94aca75551fa20b Mon Sep 17 00:00:00 2001 From: J Ross Thomson <39315853+jrossthomson@users.noreply.github.com> Date: Mon, 22 Feb 2021 10:05:46 -0500 Subject: [PATCH 024/104] Update udfs/community/pvalue.sql Co-authored-by: Daniel De Leo --- udfs/community/pvalue.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udfs/community/pvalue.sql b/udfs/community/pvalue.sql index 0d73ba2a1..413b7844e 100644 --- a/udfs/community/pvalue.sql +++ b/udfs/community/pvalue.sql @@ -3,7 +3,7 @@ CREATE OR REPLACE FUNCTION fn.pvalue(H FLOAT64, dof INT64) RETURNS FLOAT64 LANGUAGE js AS """ - return 1.0 - jStat['chisquare'].cdf(H, dof) + return 1.0 - jstat.jStat['chisquare'].cdf(H, dof) """ OPTIONS ( library=["${JS_BUCKET}/jstat-v1.9.4.min.js"] From befd86432b982d363b6e3c9690c35fe412a50388 Mon Sep 17 00:00:00 2001 From: J Ross Thomson <39315853+jrossthomson@users.noreply.github.com> Date: Mon, 22 Feb 2021 10:11:07 -0500 Subject: [PATCH 025/104] Update udfs/community/linear_regression.sql Co-authored-by: Daniel De Leo --- 
udfs/community/linear_regression.sql | 2 -- 1 file changed, 2 deletions(-) diff --git a/udfs/community/linear_regression.sql b/udfs/community/linear_regression.sql index 3b5cda234..d57a00b37 100644 --- a/udfs/community/linear_regression.sql +++ b/udfs/community/linear_regression.sql @@ -14,8 +14,6 @@ * limitations under the License. */ - - CREATE OR REPLACE FUNCTION fn.linear_regression(data ARRAY>) AS (( WITH results AS ( From ad2accc99e7d08d34ce6f12fad64cf3c82d6bc76 Mon Sep 17 00:00:00 2001 From: J Ross Thomson <39315853+jrossthomson@users.noreply.github.com> Date: Mon, 22 Feb 2021 10:22:15 -0500 Subject: [PATCH 026/104] Update udfs/community/linear_regression.sql Co-authored-by: Daniel De Leo --- udfs/community/linear_regression.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udfs/community/linear_regression.sql b/udfs/community/linear_regression.sql index d57a00b37..79af591b4 100644 --- a/udfs/community/linear_regression.sql +++ b/udfs/community/linear_regression.sql @@ -1,5 +1,5 @@ /* - * Copyright 2019 Google LLC + * Copyright 2021 Google LLC * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. From 00690a984718dd6771142fac258571c32c733a3f Mon Sep 17 00:00:00 2001 From: J Ross Thomson <39315853+jrossthomson@users.noreply.github.com> Date: Mon, 22 Feb 2021 10:22:25 -0500 Subject: [PATCH 027/104] Update udfs/community/kruskal_wallis.sql Co-authored-by: Daniel De Leo --- udfs/community/kruskal_wallis.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udfs/community/kruskal_wallis.sql b/udfs/community/kruskal_wallis.sql index 22d55f6cc..a385bd4c9 100644 --- a/udfs/community/kruskal_wallis.sql +++ b/udfs/community/kruskal_wallis.sql @@ -1,5 +1,5 @@ /* - * Copyright 2019 Google LLC + * Copyright 2021 Google LLC * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
From 4b54467cad03b93a33b9dfe882a9cddcdf2b003d Mon Sep 17 00:00:00 2001 From: J Ross Thomson <39315853+jrossthomson@users.noreply.github.com> Date: Mon, 22 Feb 2021 10:23:05 -0500 Subject: [PATCH 028/104] Update udfs/community/README.md Co-authored-by: Daniel De Leo --- udfs/community/README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/udfs/community/README.md b/udfs/community/README.md index 420f51dc6..844c7715e 100644 --- a/udfs/community/README.md +++ b/udfs/community/README.md @@ -593,10 +593,10 @@ returns: # StatsLib: Statistical UDFs -This directory contains community contributed [user-defined functions](https://cloud.google.com/bigquery/docs/reference/standard-sql/user-defined-functions) for Statistical Analysis -to extend BigQuery for more specialized usage patterns. Each UDF within this -directory will be automatically synchronized to the `bqutil` project within the -`fn` dataset for reference in queries. +This section details the subset of community contributed [user-defined functions](https://cloud.google.com/bigquery/docs/reference/standard-sql/user-defined-functions) +that extend BigQuery and enable more specialized Statistical Analysis usage patterns. +Each UDF detailed below will be automatically synchronized to the `fn` dataset +within the `bqutil` project for reference in your queries. 
For example, if you'd like to reference the `int` function within your query, you can reference it like the following: @@ -643,4 +643,4 @@ results: | results.H | results.p | results.DoF | |-----------|-----------|-------------| -| 3.4230769 | 0.1805877 | 2 | \ No newline at end of file +| 3.4230769 | 0.1805877 | 2 | From b22b52e5ac28b3f4db3041f8455f115663068fd6 Mon Sep 17 00:00:00 2001 From: J Ross Thomson <39315853+jrossthomson@users.noreply.github.com> Date: Mon, 22 Feb 2021 10:23:57 -0500 Subject: [PATCH 029/104] Update udfs/community/README.md Co-authored-by: Daniel De Leo --- udfs/community/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udfs/community/README.md b/udfs/community/README.md index 844c7715e..fe9285703 100644 --- a/udfs/community/README.md +++ b/udfs/community/README.md @@ -610,7 +610,7 @@ SELECT bqutil.fn.int(1.684) ## Documentation -### [kruskal_wallis(arr(struct(factor STRING, val FLOAT64))](kruskal_wallis.sql) +### [kruskal_wallis(ARRAY(STRUCT(factor STRING, val FLOAT64))](kruskal_wallis.sql) Takes an array of struct where each struct (point) represents a measurement, with a group label and a measurement value The [Kruskal–Wallis test by ranks](https://en.wikipedia.org/wiki/Kruskal%E2%80%93Wallis_one-way_analysis_of_variance), Kruskal–Wallis H test (named after William Kruskal and W. Allen Wallis), or one-way ANOVA on ranks is a non-parametric method for testing whether samples originate from the same distribution. It is used for comparing two or more independent samples of equal or different sample sizes. It extends the Mann–Whitney U test, which is used for comparing only two groups. The parametric equivalent of the Kruskal–Wallis test is the one-way analysis of variance (ANOVA). 
From bd0c6dd1d9442b1cd73b8fb3300ac2cd8b74a094 Mon Sep 17 00:00:00 2001 From: J Ross Thomson <39315853+jrossthomson@users.noreply.github.com> Date: Mon, 22 Feb 2021 10:24:22 -0500 Subject: [PATCH 030/104] Update udfs/community/README.md Co-authored-by: Daniel De Leo --- udfs/community/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udfs/community/README.md b/udfs/community/README.md index fe9285703..a20099512 100644 --- a/udfs/community/README.md +++ b/udfs/community/README.md @@ -636,7 +636,7 @@ set data = [ ]; -SELECT `lib_stats.kruskal_wallis`(data) as results; +SELECT `lib_stats.kruskal_wallis`(data) AS results; ``` results: From 6095ed26943eb9c7a9e7fa838cf0ac3a45025914 Mon Sep 17 00:00:00 2001 From: J Ross Thomson <39315853+jrossthomson@users.noreply.github.com> Date: Thu, 25 Feb 2021 10:08:43 -0500 Subject: [PATCH 031/104] Update README.md --- udfs/community/README.md | 48 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) diff --git a/udfs/community/README.md b/udfs/community/README.md index a20099512..5b006db8c 100644 --- a/udfs/community/README.md +++ b/udfs/community/README.md @@ -636,7 +636,7 @@ set data = [ ]; -SELECT `lib_stats.kruskal_wallis`(data) AS results; +SELECT `bqutil.fn.kruskal_wallis`(data) AS results; ``` results: @@ -644,3 +644,49 @@ results: | results.H | results.p | results.DoF | |-----------|-----------|-------------| | 3.4230769 | 0.1805877 | 2 | + + + +### [linear_regression(ARRAY(STRUCT(STRUCT(X FLOAT64, Y FLOAT64))](linear_regression.sql) +Takes an array of STRUCT X, Y and returns _a, b, r_ where _Y = a*X + b_, and _r_ is the "goodness of fit measure. + +The [Linear Regression](https://en.wikipedia.org/wiki/Linear_regression), is a linear approach to modelling the relationship between a scalar response and one or more explanatory variables (also known as dependent and independent variables). 
+ +* Input: array: struct +* Output: struct +* +```sql +DECLARE data ARRAY>; +set data = [ (5.1,2.5), (5.0,2.0), (5.7,2.6), (6.0,2.2), (5.8,2.6), (5.5,2.3), (6.1,2.8), (5.5,2.5), (6.4,3.2), (5.6,3.0)]; +SELECT `bqutils.fn.linear_regression`(data) AS results; +``` + +results: + + +| results.a | results.b | results.r | +|---------------------|--------------------|-------------------| +| -0.4353361094588436 | 0.5300416418798544 | 0.632366563565354 | + + + + +### [pvalue(H FLOAT64, dof FLOAT64)](pvalue.sql) +Takes _H_ and _dof_ and returns _p_ probability value. + +The [pvalue](https://jstat.github.io/distributions.html#jStat.chisquare.cdf) is NULL Hypothesis probability of the Kruskal-Wallis (KW) test. This is obtained to be the CDF of the chisquare with the _H_ value and the Degrees of Freedom (_dof_) of the KW problem. + +* Input: H FLOAT64, dof FLOAT64 +* Output: p FLOAT64 +* +```sql +SELECT `bqutils.fn.pvalue`(.3,2) AS results; +``` + +results: + + +| results | +|-------------------| +|0.8607079764250578 | + From 67d21f13ce5040aca8a2f721f2ea3efb29aa29b5 Mon Sep 17 00:00:00 2001 From: J Ross Thomson <39315853+jrossthomson@users.noreply.github.com> Date: Thu, 25 Feb 2021 10:16:29 -0500 Subject: [PATCH 032/104] Update README.md --- udfs/community/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/udfs/community/README.md b/udfs/community/README.md index 5b006db8c..02d92feb8 100644 --- a/udfs/community/README.md +++ b/udfs/community/README.md @@ -20,6 +20,7 @@ SELECT bqutil.fn.int(1.684) * [get_value](#get_valuek-string-arr-any-type) * [int](#intv-any-type) * [json_typeof](#json_typeofjson-string) +* [kruskal_wallis](#kruskal_wallisarraystructfactor-string-val-float64) * [last_day](#lastdaydt-date) * [linear_interpolate](#linear_interpolatepos-int64-prev-structx-int64-y-float64-next-structx-int64-y-float64) * [median](#medianarr-any-type) From d294925da39cc092b4dd6a5e9b59a6860e546967 Mon Sep 17 00:00:00 2001 From: J Ross Thomson 
<39315853+jrossthomson@users.noreply.github.com> Date: Thu, 25 Feb 2021 10:34:24 -0500 Subject: [PATCH 033/104] Update README.md --- udfs/community/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/udfs/community/README.md b/udfs/community/README.md index 02d92feb8..0b3375224 100644 --- a/udfs/community/README.md +++ b/udfs/community/README.md @@ -23,11 +23,13 @@ SELECT bqutil.fn.int(1.684) * [kruskal_wallis](#kruskal_wallisarraystructfactor-string-val-float64) * [last_day](#lastdaydt-date) * [linear_interpolate](#linear_interpolatepos-int64-prev-structx-int64-y-float64-next-structx-int64-y-float64) +* [linear_regression](#linear_regressionarraystructstructx-float64-y-float64) * [median](#medianarr-any-type) * [nlp_compromise_number](#nlp_compromise_numberstr-string) * [nlp_compromise_people](#nlp_compromise_peoplestr-string) * [percentage_change](#percentage_changeval1-float64-val2-float64) * [percentage_difference](#percentage_differenceval1-float64-val2-float64) +* [pvalue](#pvalueh-float64-dof-float64) * [radians](#radiansx-any-type) * [random_int](#random_intmin-any-type-max-any-type) * [random_value](#random_valuearr-any-type) From dbac294e8b7b0c8c0ad15823b4bf1bd6136c5abd Mon Sep 17 00:00:00 2001 From: J Ross Thomson <39315853+jrossthomson@users.noreply.github.com> Date: Thu, 25 Feb 2021 11:08:49 -0500 Subject: [PATCH 034/104] Update test_cases.yaml --- udfs/community/test_cases.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/udfs/community/test_cases.yaml b/udfs/community/test_cases.yaml index a41816c9e..22c8c8e78 100644 --- a/udfs/community/test_cases.yaml +++ b/udfs/community/test_cases.yaml @@ -403,4 +403,5 @@ kruskal_wallis: linear_regression: - test: input: (SELECT [ (5.1,2.5), (5.0,2.0), (5.7,2.6), (6.0,2.2), (5.8,2.6), (5.5,2.3), (6.1,2.8), (5.5,2.5), (6.4,3.2), (5.6,3.0)]) - expected_output: STRUCT(CAST(-0.4353361094588436 AS FLOAT64) AS a, CAST( 0.5300416418798544 AS FLOAT64) AS b, CAST(0.632366563565354 AS 
FLOAT64) AS r) \ No newline at end of file + expected_output: STRUCT(CAST(-0.4353361094588436 AS FLOAT64) AS a, CAST( 0.5300416418798544 AS FLOAT64) AS b, CAST(0.632366563565354 AS FLOAT64) AS r) + From 9d762e45215c44ff474d9cd0778ef284be2b8483 Mon Sep 17 00:00:00 2001 From: Boris Aguilar Date: Mon, 26 Apr 2021 19:48:28 -0700 Subject: [PATCH 035/104] Function to compute p-values from correlations --- udfs/community/corr_pvalue.sql | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 udfs/community/corr_pvalue.sql diff --git a/udfs/community/corr_pvalue.sql b/udfs/community/corr_pvalue.sql new file mode 100644 index 000000000..6e71833db --- /dev/null +++ b/udfs/community/corr_pvalue.sql @@ -0,0 +1,18 @@ +-- corr_pvalue: +-- Input: +-- r: correlation value, n : number of samples +-- Output: The p value of the correlation +CREATE OR REPLACE FUNCTION fn.corr_pvalue(r FLOAT64, n INT64 ) +RETURNS FLOAT64 +LANGUAGE js AS """ +var abs_r = Math.abs(r) +if ( abs_r < 1.0 ) { + var t = abs_r * Math.sqrt( (n-2) / (1.0 - (r*r)) ) + return jstat.jStat.ttest(t,n-2,2); +} else if (abs_r == 1.0 ) { + return 0.0 +} else { + return NaN +} +""" +OPTIONS (library=["${JS_BUCKET}/jstat-v1.9.4.min.js"]); From 6d891730c947027ee5c05c1232389c58af4e8d64 Mon Sep 17 00:00:00 2001 From: Boris Aguilar Date: Mon, 10 May 2021 06:01:00 +0000 Subject: [PATCH 036/104] Additional UDFs --- udfs/community/mannwhitneyu.sql | 40 +++++++++++ udfs/community/normal_cdf.sql | 8 +++ udfs/community/p_fisherexact.sql | 114 +++++++++++++++++++++++++++++++ udfs/community/test_cases.yaml | 28 +++++++- 4 files changed, 189 insertions(+), 1 deletion(-) create mode 100644 udfs/community/mannwhitneyu.sql create mode 100644 udfs/community/normal_cdf.sql create mode 100644 udfs/community/p_fisherexact.sql diff --git a/udfs/community/mannwhitneyu.sql b/udfs/community/mannwhitneyu.sql new file mode 100644 index 000000000..33681eebe --- /dev/null +++ b/udfs/community/mannwhitneyu.sql @@ -0,0 +1,40 @@ 
+--Computes the U statistics and the p value of the Mann–Whitney U test (also called Mann–Whitney–Wilcoxon) +--inputs: x,y (arrays of samples, both should be one-dimensional, type: ARRAY ) +-- alt (Defines the alternative hypothesis. The following options are available: 'two-sided', 'less', and 'greater' +CREATE OR REPLACE FUNCTION fn.mannwhitneyu(x ARRAY, y ARRAY, alt STRING) +AS ( +( + WITH statistics as ( + WITH summ as ( + WITH mydata as( + SELECT true as label, xi as val + FROM UNNEST( x ) as xi + UNION ALL + SELECT false as label, yi as val + FROM UNNEST( y ) as yi + ) + SELECT label, (RANK() OVER(ORDER BY val ASC))+(COUNT(*) OVER (PARTITION BY CAST(val AS STRING)) - 1)/2.0 AS r + FROM mydata + ) + SELECT sum(r) - ARRAY_LENGTH(x)*(ARRAY_LENGTH(x)+ 1)/2.0 as U2, + ARRAY_LENGTH(x)*(ARRAY_LENGTH(y) + (ARRAY_LENGTH(x)+ 1)/2.0) - sum(r) as U1, + ARRAY_LENGTH(x) * ARRAY_LENGTH(y) as n1n2, + SQRT(ARRAY_LENGTH(x) * ARRAY_LENGTH(y) *(ARRAY_LENGTH(x)+ARRAY_LENGTH(y)+1)/12.0 ) as den, + FROM summ + WHERE label + ), + normal_appr as ( + SELECT + CASE alt + WHEN 'less' THEN ( n1n2/2.0 +0.5 - U1)/den + WHEN 'greater' THEN (n1n2/2.0 +0.5 - U2)/den + ELSE -1.0*ABS(n1n2/2.0 + 0.5 - IF(U1[2, 4, 6, 2, 3, 7, 5, 1.], ARRAY[8, 10, 11, 14, 20, 18, 19, 9.], CAST('two-sided' AS STRING) + expected_output: STRUCT(CAST(0.0 AS FLOAT64) AS U, CAST(9.391056991171487E-4 AS FLOAT64) AS p) From 844e02097b8757eb919cd29bc2227ec273a22599 Mon Sep 17 00:00:00 2001 From: Ian Mathews Date: Tue, 11 May 2021 12:14:31 -0700 Subject: [PATCH 037/104] fix(test): handle jslibs bundling, reference in local testing --- udfs/cloudbuild_js_libs.yaml | 66 ++++++++++++++++++++++++++++++++++ udfs/js_libs/README.md | 13 +++++++ udfs/tests/test_create_udfs.py | 1 + udfs/tests/udf_test_utils.py | 9 +++++ 4 files changed, 89 insertions(+) create mode 100644 udfs/cloudbuild_js_libs.yaml diff --git a/udfs/cloudbuild_js_libs.yaml b/udfs/cloudbuild_js_libs.yaml new file mode 100644 index 000000000..55ce5b8bf --- 
/dev/null +++ b/udfs/cloudbuild_js_libs.yaml @@ -0,0 +1,66 @@ +# Builds js_libs to your own GCS bucket for testing +# gcloud builds submit . --config=build_js_libs.yaml --substitutions=_JS_BUCKET="gs://YOUR_GS_BUCKET[/optional_sub_path]" +# + +steps: + ############################################################ + # Dynamically create the package.json file based off the libs + # specified in the js_libs/js_libs.yaml file. + ############################################################ +- name: gcr.io/bqutil/bq_udf_ci + id: generate_js_libs_package_json + entrypoint: python3 + args: + - tests/udf_test_utils.py + - --generate-js-libs-package-json + ########################################################### + # Install npm packages based off the package.json file + # created in the previous step. + ########################################################### +- name: node + id: install_npm_packages + entrypoint: npm + args: + - install + ############################################################ + # Dynamically create webpack config files needed by webpack + # to build npm packages into single .js files which will be + # hosted on GCS and used by BigQuery UDFs. 
+ ############################################################ +- name: gcr.io/bqutil/bq_udf_ci + id: generate_webpack_configs + entrypoint: python3 + args: + - tests/udf_test_utils.py + - --generate-webpack-configs + waitFor: + - install_npm_packages + - generate_js_libs_package_json + ########################################################### + # Build (via webpack) all js libraries for BigQuery UDFs + ########################################################### +- name: node + id: build_bq_js_libs + entrypoint: npm + args: + - run-script + - build-all-libs + waitFor: + - generate_webpack_configs + - install_npm_packages + ########################################################### + # Copy all libs to GCS bucket + ########################################################### +- name: gcr.io/google.com/cloudsdktool/cloud-sdk + id: copy_js_to_gcs + entrypoint: gsutil + args: + - '-m' + - cp + - js_builds/* + - ${_JS_BUCKET} + waitFor: + - build_bq_js_libs +timeout: 1800s # 30 minutes +options: + machineType: N1_HIGHCPU_32 diff --git a/udfs/js_libs/README.md b/udfs/js_libs/README.md index 3353b158f..5adaa150b 100644 --- a/udfs/js_libs/README.md +++ b/udfs/js_libs/README.md @@ -21,3 +21,16 @@ CREATE FUNCTION myFunc(a FLOAT64, b STRING) **Note: When your UDF makes a call to the javascript library, make sure to convert any dashes '-' to underscores '_' in the javascript library name.** (e.g. LIBRARY-NAME in example above is invoked as LIBRARY_NAME in the UDF body.) + +## Testing js_libs locally + +If you'd like to add (or update the version of) a javascript library, you can build js_libs via cloudbuild and upload them to your own GCS bucket. +After updating [js_libs.yaml](js_libs.yaml), execute the cloudbuild_js_libs build, substituting a GCS bucket within your test project accordingly: +```bash +gcloud builds submit . 
--config=cloudbuild_js_libs.yaml --substitutions=_JS_BUCKET="gs://YOUR_GCS_BUCKET[/optional_sub_path] +``` +After the libraries have been built, you'll need to provide the `_JS_BUCKET` environment variable when running your local UDF tests. +For example: +```bash +_JS_BUCKET="gs://YOUR_GCS_BUCKET[/optional_sub_path]" bash tests/run.sh -k kruskal_wallis +``` diff --git a/udfs/tests/test_create_udfs.py b/udfs/tests/test_create_udfs.py index 0124e0670..538f78e85 100644 --- a/udfs/tests/test_create_udfs.py +++ b/udfs/tests/test_create_udfs.py @@ -42,6 +42,7 @@ def test_create_udf(self, udf_path): try: with open(udf_path) as udf_file: udf_sql = udf_file.read() + udf_sql = utils.replace_with_js_bucket(os.getenv('_JS_BUCKET') or 'gs://bqutil-lib/bq_js_libs', udf_sql) # Only replace UDF datasets with a test dataset if the # build was triggered by a pull request or a non-main branch if (os.getenv('_PR_NUMBER') is not None or diff --git a/udfs/tests/udf_test_utils.py b/udfs/tests/udf_test_utils.py index d2bae32c4..a8a171670 100644 --- a/udfs/tests/udf_test_utils.py +++ b/udfs/tests/udf_test_utils.py @@ -143,6 +143,15 @@ def replace_with_null_body(udf_path): else: return None +def replace_with_js_bucket(js_bucket, udf_sql): + """ + Replaces the JS_BUCKET placeholder in UDFs, when present + + :param js_bucket: js_bucket for the bundled js_libs files + :param udf_sql: SQL DDL of the UDF + """ + + return udf_sql.replace('${JS_BUCKET}', js_bucket) def replace_with_test_datasets(project_id, udf_sql): """ From 11efd37eb3496b40eb0c7aac651662440542856f Mon Sep 17 00:00:00 2001 From: Ross Thomson Date: Tue, 11 May 2021 21:36:49 +0000 Subject: [PATCH 038/104] Adding info about testing js_libs locally. 
--- udfs/CONTRIBUTING.md | 5 +++++ udfs/js_libs/README.md | 3 ++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/udfs/CONTRIBUTING.md b/udfs/CONTRIBUTING.md index 647be97c4..47a61431a 100644 --- a/udfs/CONTRIBUTING.md +++ b/udfs/CONTRIBUTING.md @@ -58,6 +58,11 @@ the pull request can ensure a successful review process. Please follow these instructions to confirm that your test cases work as expected. +> **WARNING:** Some UDFs require the use of Javascript libraries. The unit tests on +> these UDFs will fail during local testing if the JS libraries are not +> created locally. You can ignore these failures if you don't care about Javascript libraries. +> Or, **to avoid these failures**, follow the steps [here](js_libs/README.md#testing-js_libs-locally) + 1. Change into the bigquery_utils [udfs/](./) directory: * `cd udfs/` diff --git a/udfs/js_libs/README.md b/udfs/js_libs/README.md index 5adaa150b..56c0d9732 100644 --- a/udfs/js_libs/README.md +++ b/udfs/js_libs/README.md @@ -32,5 +32,6 @@ gcloud builds submit . --config=cloudbuild_js_libs.yaml --substitutions=_JS_BUCK After the libraries have been built, you'll need to provide the `_JS_BUCKET` environment variable when running your local UDF tests. 
For example: ```bash -_JS_BUCKET="gs://YOUR_GCS_BUCKET[/optional_sub_path]" bash tests/run.sh -k kruskal_wallis +_JS_BUCKET="gs://YOUR_GCS_BUCKET[/optional_sub_path]" +bash tests/run.sh -k kruskal_wallis ``` From ac48d5a6eaa723b9ed9d09d240749f74dd8c0f4b Mon Sep 17 00:00:00 2001 From: Ross Thomson Date: Wed, 12 May 2021 13:30:53 +0000 Subject: [PATCH 039/104] Removed typo --- udfs/community/mannwhitneyu.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udfs/community/mannwhitneyu.sql b/udfs/community/mannwhitneyu.sql index 33681eebe..8b176e159 100644 --- a/udfs/community/mannwhitneyu.sql +++ b/udfs/community/mannwhitneyu.sql @@ -34,7 +34,7 @@ AS ( IF( alt='less' OR alt='greater', 1.0, 2.0 ) as factor FROM statistics ) - SELECT struct(U, factor* fn.normal_cdf`(z,0.0,1.0) as p ) + SELECT struct(U, factor* fn.normal_cdf(z,0.0,1.0) as p ) FROM normal_appr ) ); From d56baa5932e79d72133fa8e25a0381b7053c6be1 Mon Sep 17 00:00:00 2001 From: Ian Mathews Date: Tue, 8 Jun 2021 10:53:12 -0700 Subject: [PATCH 040/104] fix(tests): remove NaN test, property reference for JS methods --- udfs/community/corr_pvalue.sql | 6 +++--- udfs/community/test_cases.yaml | 3 --- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/udfs/community/corr_pvalue.sql b/udfs/community/corr_pvalue.sql index 6e71833db..dc51c4a46 100644 --- a/udfs/community/corr_pvalue.sql +++ b/udfs/community/corr_pvalue.sql @@ -5,10 +5,10 @@ CREATE OR REPLACE FUNCTION fn.corr_pvalue(r FLOAT64, n INT64 ) RETURNS FLOAT64 LANGUAGE js AS """ -var abs_r = Math.abs(r) +var abs_r = Math['abs'](r) if ( abs_r < 1.0 ) { - var t = abs_r * Math.sqrt( (n-2) / (1.0 - (r*r)) ) - return jstat.jStat.ttest(t,n-2,2); + var t = abs_r * Math['sqrt']( (n-2) / (1.0 - (r*r)) ) + return jstat['jStat']['ttest'](t,n-2,2); } else if (abs_r == 1.0 ) { return 0.0 } else { diff --git a/udfs/community/test_cases.yaml b/udfs/community/test_cases.yaml index b081f5f2e..753550467 100644 --- a/udfs/community/test_cases.yaml +++ 
b/udfs/community/test_cases.yaml @@ -439,9 +439,6 @@ corr_pvalue: - test: input: CAST(0.0 AS FLOAT64), CAST(50 AS INT64) expected_output: CAST(1.0000000078386753 AS FLOAT64) - - test: - input: CAST(1.1 AS FLOAT64), CAST(40 AS INT64) - expected_output: CAST(NaN AS FLOAT64) p_fisherexact: - test: input: CAST(90 AS FLOAT64), CAST(27 AS FLOAT64), CAST(17 AS FLOAT64), CAST(50 AS FLOAT64) From 6b34d8b1cedaefd88bd58b39d0a7c7b46a504de5 Mon Sep 17 00:00:00 2001 From: Boris Aguilar Date: Tue, 22 Jun 2021 04:11:52 +0000 Subject: [PATCH 041/104] isb-cgc functions fixed --- udfs/community/normal_cdf.sql | 2 +- udfs/community/p_fisherexact.sql | 16 ++++++++-------- udfs/community/test_cases.yaml | 6 +++++- udfs/tests/run.sh | 2 +- 4 files changed, 15 insertions(+), 11 deletions(-) diff --git a/udfs/community/normal_cdf.sql b/udfs/community/normal_cdf.sql index 4204f3690..2f9f01d68 100644 --- a/udfs/community/normal_cdf.sql +++ b/udfs/community/normal_cdf.sql @@ -1,7 +1,7 @@ CREATE OR REPLACE FUNCTION fn.normal_cdf(x FLOAT64, mean FLOAT64, std FLOAT64) RETURNS FLOAT64 LANGUAGE js AS """ - return jstat.jStat.normal.cdf( x, mean, std ) + return jstat['jStat']['normal']['cdf']( x, mean, std ) """ OPTIONS ( library=["${JS_BUCKET}/jstat-v1.9.4.min.js"] diff --git a/udfs/community/p_fisherexact.sql b/udfs/community/p_fisherexact.sql index 9fae580c3..47cd6414f 100644 --- a/udfs/community/p_fisherexact.sql +++ b/udfs/community/p_fisherexact.sql @@ -27,7 +27,7 @@ function lngamm(z) x -= 1259.139216722289 /(z+1); x += 676.5203681218835 /(z); x += 0.9999999999995183; - return(Math.log(x)-5.58106146679532777-z+(z-0.5)*Math.log(z+6.5)); + return(Math['log'](x)-5.58106146679532777-z+(z-0.5)*Math['log'](z+6.5)); } function lnfact(n) { @@ -67,7 +67,7 @@ function LynHyperGoe_appr( a, b, c, d ) { } - var n = Math.round(a + b + c + d); + var n = Math['round'](a + b + c + d); var temp = 0 ; var LnPrx = 0.0 ; var n1_ = a+b ; @@ -87,28 +87,28 @@ function LynHyperGoe_appr( a, b, c, d ) { for (x = min; 
x <= max; x++) { LnPrx = LnHyperGeometric( x , n1_ - x, n_1 - x, n_21 +x) ; if ( LnPrx <= LnPra ) { - temp = temp + Math.exp( LnPrx - LnPra ); + temp = temp + Math['exp']( LnPrx - LnPra ); } } } else { var LnPra = LynHyperGoe_appr( a, b, c, d) ; LnPrx = (min, n1_-min, n_1-min, n_21+min); if ( LnPrx <= LnPra ) { - temp = temp + Math.exp( LnPrx - LnPra ); + temp = temp + Math['exp']( LnPrx - LnPra ); } for (x = min+1; x <= max; x++) { if (! (x % 10 == 0) ) { - LnPrx = LnPrx + Math.log( ((n_1 - x +1)/x)*(n1_ -x +1)/( n_21 + x) ) ; + LnPrx = LnPrx + Math['log']( ((n_1 - x +1)/x)*(n1_ -x +1)/( n_21 + x) ) ; } else { LnPrx = LynHyperGoe_appr(x, n1_-x, n_1-x, n_21+x); } if ( LnPrx <= LnPra ) { - temp = temp + Math.exp( LnPrx - LnPra ); + temp = temp + Math['exp']( LnPrx - LnPra ); } } } - var LnPFET = LnPra + Math.log( temp ); - return Math.exp( LnPFET ) ; + var LnPFET = LnPra + Math['log']( temp ); + return Math['exp']( LnPFET ) ; """; diff --git a/udfs/community/test_cases.yaml b/udfs/community/test_cases.yaml index 753550467..de7d695f8 100644 --- a/udfs/community/test_cases.yaml +++ b/udfs/community/test_cases.yaml @@ -439,6 +439,10 @@ corr_pvalue: - test: input: CAST(0.0 AS FLOAT64), CAST(50 AS INT64) expected_output: CAST(1.0000000078386753 AS FLOAT64) +normal_cdf: + - test: + input: CAST(0.0 AS FLOAT64), CAST(0.0 AS FLOAT64), CAST(1.0 AS FLOAT64) + expectd_output: CAST(0.5 AS FLOAT64) p_fisherexact: - test: input: CAST(90 AS FLOAT64), CAST(27 AS FLOAT64), CAST(17 AS FLOAT64), CAST(50 AS FLOAT64) @@ -446,4 +450,4 @@ p_fisherexact: mannwhitneyu: - test: input: ARRAY[2, 4, 6, 2, 3, 7, 5, 1.], ARRAY[8, 10, 11, 14, 20, 18, 19, 9.], CAST('two-sided' AS STRING) - expected_output: STRUCT(CAST(0.0 AS FLOAT64) AS U, CAST(9.391056991171487E-4 AS FLOAT64) AS p) + expected_output: STRUCT(CAST(64.0 AS FLOAT64) AS U, CAST(9.391056991171487E-4 AS FLOAT64) AS p) diff --git a/udfs/tests/run.sh b/udfs/tests/run.sh index 53b1d21b4..6e8e0705e 100755 --- a/udfs/tests/run.sh +++ 
b/udfs/tests/run.sh @@ -19,4 +19,4 @@ python3 tests/udf_test_utils.py --create-test-datasets python3 -m pytest --workers 100 tests/create_udf_signatures.py "$@" python3 -m pytest --workers 100 tests/test_create_udfs.py "$@" python3 -m pytest --workers 100 tests/test_run_udfs.py "$@" -python3 tests/udf_test_utils.py --delete-test-datasets \ No newline at end of file +python3 tests/udf_test_utils.py --delete-test-datasets From fd4c19d66a7225de9a37ed0c809e8e8839665cc6 Mon Sep 17 00:00:00 2001 From: Ian Mathews Date: Tue, 22 Jun 2021 16:00:41 -0400 Subject: [PATCH 042/104] feat: implement jstat_udf --- udfs/community/jstat.sql | 14 ++++++++++++++ udfs/community/test_cases.yaml | 4 ++++ 2 files changed, 18 insertions(+) create mode 100644 udfs/community/jstat.sql diff --git a/udfs/community/jstat.sql b/udfs/community/jstat.sql new file mode 100644 index 000000000..37f4b1e3f --- /dev/null +++ b/udfs/community/jstat.sql @@ -0,0 +1,14 @@ +CREATE OR REPLACE FUNCTION fn.jstat(method STRING, args ARRAY) +RETURNS FLOAT64 +LANGUAGE js AS """ + const methodPath = method['split']('.') + let fn = jstat['jStat'] + for (const name of methodPath){ + fn = fn[name] + } + + return fn(...args) +""" +OPTIONS ( + library=["${JS_BUCKET}/jstat-v1.9.4.min.js"] +); diff --git a/udfs/community/test_cases.yaml b/udfs/community/test_cases.yaml index ff7d68c40..1e7e54790 100644 --- a/udfs/community/test_cases.yaml +++ b/udfs/community/test_cases.yaml @@ -411,6 +411,10 @@ levenshtein: # # Below targets StatsLib work # +jstat: + - test: + input: CAST('chisquare.cdf' AS STRING), ARRAY[0.3, 2.0] + expected_output: CAST(0.1392920235749422 AS FLOAT64) pvalue: - test: input: CAST(0.3 AS FLOAT64), CAST(2 AS INT64) From f230b4eacb72dc339518259b45d793a407391ce4 Mon Sep 17 00:00:00 2001 From: Ross Thomson Date: Tue, 20 Jul 2021 16:31:37 +0000 Subject: [PATCH 043/104] Trying to fix mann_whitney and normal_cdf --- .gitignore | 1 + udfs/community/mannwhitneyu.sql | 4 ++-- udfs/community/test_cases.yaml | 2 +- 
udfs/tests/run.sh | 2 +- 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index 2e4597c8e..a306cf519 100644 --- a/.gitignore +++ b/.gitignore @@ -28,3 +28,4 @@ hs_err_pid* .vscode .idea/ target/ +udfs/tests/run.sh diff --git a/udfs/community/mannwhitneyu.sql b/udfs/community/mannwhitneyu.sql index 49bf47b4d..7c116c2f5 100644 --- a/udfs/community/mannwhitneyu.sql +++ b/udfs/community/mannwhitneyu.sql @@ -33,7 +33,7 @@ AS ( U1 as U, IF( alt='less' OR alt='greater', 1.0, 2.0 ) as factor FROM statistics +) SELECT struct(U, factor* fn.normal_cdf(z,0.0,1.0) as p ) FROM normal_appr -) -); +)); \ No newline at end of file diff --git a/udfs/community/test_cases.yaml b/udfs/community/test_cases.yaml index b47e028e7..4600af39e 100644 --- a/udfs/community/test_cases.yaml +++ b/udfs/community/test_cases.yaml @@ -446,7 +446,7 @@ corr_pvalue: normal_cdf: - test: input: CAST(0.0 AS FLOAT64), CAST(0.0 AS FLOAT64), CAST(1.0 AS FLOAT64) - expectd_output: CAST(0.5 AS FLOAT64) + expected_output: CAST(0.5 AS FLOAT64) p_fisherexact: - test: diff --git a/udfs/tests/run.sh b/udfs/tests/run.sh index 6e8e0705e..2327394c5 100755 --- a/udfs/tests/run.sh +++ b/udfs/tests/run.sh @@ -19,4 +19,4 @@ python3 tests/udf_test_utils.py --create-test-datasets python3 -m pytest --workers 100 tests/create_udf_signatures.py "$@" python3 -m pytest --workers 100 tests/test_create_udfs.py "$@" python3 -m pytest --workers 100 tests/test_run_udfs.py "$@" -python3 tests/udf_test_utils.py --delete-test-datasets +#python3 tests/udf_test_utils.py --delete-test-datasets From d31de79a3ab2cf36e8192656a6d4554f31c147e5 Mon Sep 17 00:00:00 2001 From: Ross Thomson Date: Tue, 20 Jul 2021 19:29:46 +0000 Subject: [PATCH 044/104] Added t_test; Student's T --- udfs/community/t_test.sql | 37 ++++++++++++++++++++++++++++++++++ udfs/community/test_cases.yaml | 5 ++++- 2 files changed, 41 insertions(+), 1 deletion(-) create mode 100644 udfs/community/t_test.sql diff --git 
a/udfs/community/t_test.sql b/udfs/community/t_test.sql new file mode 100644 index 000000000..ed81d7c53 --- /dev/null +++ b/udfs/community/t_test.sql @@ -0,0 +1,37 @@ +/* + * Copyright 2019 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +-- Student's T-Test +-- Input: + -- pop1, array of FLOAT64, values from first group + -- pop2, array of FLOAT64, values from second group +-- Output: + -- Struct of t_value and DOF + + +CREATE OR REPLACE FUNCTION fn.t_test(pop1 ARRAY, pop2 ARRAY) AS (( + + WITH pop1 AS ( + SELECT AVG(p1) x1, STDDEV(p1) st1, COUNT(p1) AS n1 FROM UNNEST(pop1) AS p1 + ), + pop2 as ( + SELECT AVG(p2) x2, STDDEV(p2) st2, COUNT(p2) AS n2 FROM UNNEST(pop2) AS p2 + ) + SELECT + STRUCT(ABS(x1 - x2) / Sqrt((st1 * st1 / n1) + (st2 * st2/ n2)) AS t_value, n1+n2-2 AS dof) + FROM pop1 CROSS JOIN pop2 + +)); \ No newline at end of file diff --git a/udfs/community/test_cases.yaml b/udfs/community/test_cases.yaml index 4600af39e..f2e85fae3 100644 --- a/udfs/community/test_cases.yaml +++ b/udfs/community/test_cases.yaml @@ -456,4 +456,7 @@ mannwhitneyu: - test: input: ARRAY[2, 4, 6, 2, 3, 7, 5, 1.], ARRAY[8, 10, 11, 14, 20, 18, 19, 9.], CAST('two-sided' AS STRING) expected_output: STRUCT(CAST(64.0 AS FLOAT64) AS U, CAST(9.391056991171487E-4 AS FLOAT64) AS p) - +t_test: + - test: + input: ARRAY[13.3,6.0,20.0,8.0,14.0,19.0,18.0,25.0,16.0,24.0,15.0,1.0,15.0], ARRAY[22.0,16.0,21.7,21.0,30.0,26.0,12.0,23.2,28.0,23.0] + expected_output: 
STRUCT(CAST(2.8957935572829476 AS FLOAT64) AS t_value, CAST(21 AS INT64) AS dof) From e0f42e2fd095d3aa4b486f1004e13a44d88fbaf6 Mon Sep 17 00:00:00 2001 From: Ross Thomson Date: Tue, 20 Jul 2021 19:44:55 +0000 Subject: [PATCH 045/104] Added T test to docs. --- udfs/community/README.md | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/udfs/community/README.md b/udfs/community/README.md index 410664cc2..ec24ca65c 100644 --- a/udfs/community/README.md +++ b/udfs/community/README.md @@ -724,3 +724,26 @@ results: |-------------------| |0.8607079764250578 | +----- +### [t_test(ARRAY,ARRAY)](t_test.sql) + +Runs the Student's T-test. Well known test to compare populations. Example taken from here: [Sample](https://www.jmp.com/en_ch/statistics-knowledge-portal/t-test/two-sample-t-test.html) + +Sample Query: + +```SQL +DECLARE pop1 ARRAY; +DECLARE pop2 ARRAY; + +SET pop1 = [13.3,6.0,20.0,8.0,14.0,19.0,18.0,25.0,16.0,24.0,15.0,1.0,15.0]; +SET pop2 = [22.0,16.0,21.7,21.0,30.0,26.0,12.0,23.2,28.0,23.0] ; + +SELECT `bqutils.fn.t_test`(pop1, pop2) AS actual_result_rows; + +``` + +Results: + +| Row | actual_result_rows.t_value | actual_result_rows.dof| +|-----|----------------------------|-----------------------| +| 1 | 2.8957935572829476 | 21 From 5699b1b2c96d4c0343009ded30f471d058585183 Mon Sep 17 00:00:00 2001 From: Ross Thomson Date: Tue, 20 Jul 2021 19:59:42 +0000 Subject: [PATCH 046/104] Renamed pvalue.sql to chisquare_cdf --- udfs/community/README.md | 9 +++++---- udfs/community/{pvalue.sql => chisquare_cdf.sql} | 2 +- udfs/community/kruskal_wallis.sql | 2 +- udfs/community/test_cases.yaml | 5 ++++- 4 files changed, 11 insertions(+), 7 deletions(-) rename udfs/community/{pvalue.sql => chisquare_cdf.sql} (71%) diff --git a/udfs/community/README.md b/udfs/community/README.md index ec24ca65c..969a02b43 100644 --- a/udfs/community/README.md +++ b/udfs/community/README.md @@ -13,6 +13,7 @@ SELECT bqutil.fn.int(1.684) ## UDFs +* 
[chisquare_cdf](#chisquare_cdfh-float64-dof-float64) * [csv_to_struct](#csv_to_structstrlist-string) * [find_in_set](#find_in_setstr-string-strlist-string) * [freq_table](#freq_tablearr-any-type) @@ -30,7 +31,6 @@ SELECT bqutil.fn.int(1.684) * [nlp_compromise_people](#nlp_compromise_peoplestr-string) * [percentage_change](#percentage_changeval1-float64-val2-float64) * [percentage_difference](#percentage_differenceval1-float64-val2-float64) -* [pvalue](#pvalueh-float64-dof-float64) * [radians](#radiansx-any-type) * [random_int](#random_intmin-any-type-max-any-type) * [random_value](#random_valuearr-any-type) @@ -40,6 +40,7 @@ SELECT bqutil.fn.int(1.684) * [ts_session_group](#ts_session_grouprow_ts-timestamp-prev_ts-timestamp-session_gap-int64) * [ts_slide](#ts_slidets-timestamp-period-int64-duration-int64) * [ts_tumble](#ts_tumbleinput_ts-timestamp-tumble_seconds-int64) +* [t_test](#t_testarrayarray) * [typeof](#typeofinput-any-type) * [url_keys](#url_keysquery-string) * [url_param](#url_paramquery-string-p-string) @@ -705,16 +706,16 @@ results: -### [pvalue(H FLOAT64, dof FLOAT64)](pvalue.sql) +### [chisquare_cdf(H FLOAT64, dof FLOAT64)](chisquare_cdf.sql) Takes _H_ and _dof_ and returns _p_ probability value. -The [pvalue](https://jstat.github.io/distributions.html#jStat.chisquare.cdf) is NULL Hypothesis probability of the Kruskal-Wallis (KW) test. This is obtained to be the CDF of the chisquare with the _H_ value and the Degrees of Freedom (_dof_) of the KW problem. +The [chisquare_cdf](https://jstat.github.io/distributions.html#jStat.chisquare.cdf) is NULL Hypothesis probability of the Kruskal-Wallis (KW) test. This is obtained to be the CDF of the chisquare with the _H_ value and the Degrees of Freedom (_dof_) of the KW problem. 
* Input: H FLOAT64, dof FLOAT64 * Output: p FLOAT64 * ```sql -SELECT `bqutils.fn.pvalue`(.3,2) AS results; +SELECT `bqutils.fn.chisquare_cdf`(.3,2) AS results; ``` results: diff --git a/udfs/community/pvalue.sql b/udfs/community/chisquare_cdf.sql similarity index 71% rename from udfs/community/pvalue.sql rename to udfs/community/chisquare_cdf.sql index 413b7844e..f768bffd1 100644 --- a/udfs/community/pvalue.sql +++ b/udfs/community/chisquare_cdf.sql @@ -1,6 +1,6 @@ #standardSQL -CREATE OR REPLACE FUNCTION fn.pvalue(H FLOAT64, dof INT64) +CREATE OR REPLACE FUNCTION fn.chisquare_cdf(H FLOAT64, dof INT64) RETURNS FLOAT64 LANGUAGE js AS """ return 1.0 - jstat.jStat['chisquare'].cdf(H, dof) diff --git a/udfs/community/kruskal_wallis.sql b/udfs/community/kruskal_wallis.sql index a385bd4c9..8edf1fe07 100644 --- a/udfs/community/kruskal_wallis.sql +++ b/udfs/community/kruskal_wallis.sql @@ -32,5 +32,5 @@ CREATE OR REPLACE FUNCTION fn.kruskal_wallis(data ARRAY[0.3, 2.0] expected_output: CAST(0.1392920235749422 AS FLOAT64) -pvalue: +chisquare_cdf: - test: input: CAST(0.3 AS FLOAT64), CAST(2 AS INT64) expected_output: CAST(0.8607079764250578 AS FLOAT64) @@ -460,3 +460,6 @@ t_test: - test: input: ARRAY[13.3,6.0,20.0,8.0,14.0,19.0,18.0,25.0,16.0,24.0,15.0,1.0,15.0], ARRAY[22.0,16.0,21.7,21.0,30.0,26.0,12.0,23.2,28.0,23.0] expected_output: STRUCT(CAST(2.8957935572829476 AS FLOAT64) AS t_value, CAST(21 AS INT64) AS dof) +# +# Please enter new tests above this section if they are not related to STATSLIB. 
+# \ No newline at end of file From b4e7d9448c1ee9ffd05b1ab903c7f41814ead5ed Mon Sep 17 00:00:00 2001 From: Ian Mathews Date: Tue, 17 Aug 2021 08:48:49 -0700 Subject: [PATCH 047/104] wip: implement basic piping for chi-square --- stored_procedures/chi_square.sql | 71 ++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 stored_procedures/chi_square.sql diff --git a/stored_procedures/chi_square.sql b/stored_procedures/chi_square.sql new file mode 100644 index 000000000..4f4cc3fca --- /dev/null +++ b/stored_procedures/chi_square.sql @@ -0,0 +1,71 @@ +-- @param STRING table_name table (or subquery) that contains the data +-- @param STRING independent_var name of the column in our table that represents our independent variable +-- @param STRING dependent_var name of the column in our table that represents our dependent variable +-- TODO: return struct rather than record (?) +-- @return RECORD + +-- TODO: check math (this isn't correct), use group by for performance +CREATE OR REPLACE PROCEDURE bqutil.procedure.chi_square (table_name STRING, independent_var STRING, dependent_var STRING ) +BEGIN +EXECUTE IMMEDIATE """ + WITH contingency_table AS ( + SELECT DISTINCT + @independent_var as independent_var, + @dependent_var as dependent_var, + COUNT(*) OVER(PARTITION BY @independent_var, @dependent_var) as count, + COUNT(*) OVER(PARTITION BY @independent_var) independent_total, + COUNT(*) OVER(PARTITION BY @dependent_var) dependent_total, + COUNT(*) OVER() as total + FROM + """ || table_name || """ AS t0 + ), + expected_table AS ( + SELECT + independent_var, + dependent_var, + independent_total * dependent_total / total as count + FROM `contingency_table` + ) + SELECT + SUM(POW(contingency_table.count - expected_table.count, 2) / expected_table.count) as chi_square, + (COUNT(DISTINCT contingency_table.independent_var) - 1) + * (COUNT(DISTINCT contingency_table.dependent_var) - 1) AS degrees_freedom + FROM contingency_table + INNER JOIN 
expected_table + ON expected_table.independent_var = contingency_table.independent_var + AND expected_table.dependent_var = contingency_table.dependent_var +""" USING table_name as table_name, independent_var as independent_var, dependent_var as dependent_var; +END; + +-- a unit test of chi_square +-- TODO: this is pretty slow, we should do one insert with lots of records rather than looping through +BEGIN + DECLARE i INT64 DEFAULT 0; + CREATE TEMP TABLE categorical (sex STRING, party STRING); + + WHILE i < 2 DO + INSERT INTO categorical (sex, party) VALUES('male', 'republican'); + SET i = i + 1; + END WHILE; + + SET i = 0; + WHILE i < 1 DO + INSERT INTO categorical (sex, party) VALUES('male', 'democrat'); + SET i = i + 1; + END WHILE; + + SET i = 0; + WHILE i < 3 DO + INSERT INTO categorical (sex, party) VALUES('female', 'republican'); + SET i = i + 1; + END WHILE; + + SET i = 0; + WHILE i < 2 DO + INSERT INTO categorical (sex, party) VALUES('female', 'democrat'); + SET i = i + 1; + END WHILE; + + CALL bqutil.procedure.chi_square('categorical', 'sex', 'party'); +-- TODO: print assertion that result is what we expect +END; From a6b902b8fab4b1281ae3eeb4ee62016886fba4e3 Mon Sep 17 00:00:00 2001 From: Ian Mathews Date: Tue, 17 Aug 2021 09:09:38 -0700 Subject: [PATCH 048/104] fix: use string concatenation to return proper chi-square results --- stored_procedures/chi_square.sql | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/stored_procedures/chi_square.sql b/stored_procedures/chi_square.sql index 4f4cc3fca..19fc3fbe8 100644 --- a/stored_procedures/chi_square.sql +++ b/stored_procedures/chi_square.sql @@ -4,20 +4,19 @@ -- TODO: return struct rather than record (?) 
-- @return RECORD --- TODO: check math (this isn't correct), use group by for performance +-- TODO: Use group by for scalability/performance; break into several statements CREATE OR REPLACE PROCEDURE bqutil.procedure.chi_square (table_name STRING, independent_var STRING, dependent_var STRING ) BEGIN EXECUTE IMMEDIATE """ WITH contingency_table AS ( SELECT DISTINCT - @independent_var as independent_var, - @dependent_var as dependent_var, - COUNT(*) OVER(PARTITION BY @independent_var, @dependent_var) as count, - COUNT(*) OVER(PARTITION BY @independent_var) independent_total, - COUNT(*) OVER(PARTITION BY @dependent_var) dependent_total, + """ || independent_var || """ as independent_var, + """ || dependent_var || """ as dependent_var, + COUNT(*) OVER(PARTITION BY """ || independent_var || """, """ || dependent_var || """) as count, + COUNT(*) OVER(PARTITION BY """ || independent_var || """) independent_total, + COUNT(*) OVER(PARTITION BY """ || dependent_var || """) dependent_total, COUNT(*) OVER() as total - FROM - """ || table_name || """ AS t0 + FROM """ || table_name || """ AS t0 ), expected_table AS ( SELECT @@ -34,7 +33,7 @@ EXECUTE IMMEDIATE """ INNER JOIN expected_table ON expected_table.independent_var = contingency_table.independent_var AND expected_table.dependent_var = contingency_table.dependent_var -""" USING table_name as table_name, independent_var as independent_var, dependent_var as dependent_var; +"""; END; -- a unit test of chi_square From ccc349d174eecaf3df378743d778b7288e26c7bf Mon Sep 17 00:00:00 2001 From: Ian Mathews Date: Tue, 17 Aug 2021 09:37:44 -0700 Subject: [PATCH 049/104] feat: add linear_regression stored procedure --- stored_procedures/linear_regression.sql | 199 ++++++++++++++++++++++++ 1 file changed, 199 insertions(+) create mode 100644 stored_procedures/linear_regression.sql diff --git a/stored_procedures/linear_regression.sql b/stored_procedures/linear_regression.sql new file mode 100644 index 000000000..fb1c81214 --- /dev/null 
+++ b/stored_procedures/linear_regression.sql @@ -0,0 +1,199 @@ +-- @param STRING table_name table (or subquery) that contains the data +-- @param STRING independent_var name of the column in our table that represents our independent variable +-- @param STRING dependent_var name of the column in our table that represents our dependent variable +-- @return STRUCT + +CREATE OR REPLACE PROCEDURE bqutil.procedure.linear_regression (table_name STRING, independent_var STRING, dependent_var STRING ) +BEGIN +EXECUTE IMMEDIATE """ + WITH results AS ( + WITH sums AS ( + WITH d AS ( + SELECT + """ || independent_var || """ AS x, + """ || dependent_var || """ AS y + FROM """ || table_name || """ + ) + SELECT + SUM(d.X) AS Sx, + SUM(d.Y) AS Sy, + SUM(d.X * d.Y) AS Sxy, + SUM(d.X * d.X) AS Sxx, + SUM(d.Y * d.Y) AS Syy, + COUNT(*) AS N + FROM d + ) + SELECT + ((Sy * Sxx) - (Sx * Sxy)) / ((N * (Sxx)) - (Sx * Sx)) AS a, + ((N * Sxy) - (Sx * Sy)) / ((N * Sxx) - (Sx * Sx)) AS b, + ((N * Sxy) - (Sx * Sy))/ SQRT( + (((N * Sxx) - (Sx * Sx))* ((N * Syy - (Sy * Sy))))) AS r + FROM sums + ) + SELECT STRUCT(a, b, r) FROM results; +"""; +END; + +-- a unit test of linear_regression +BEGIN + CREATE TEMP TABLE iris (sepal_length FLOAT64, sepal_width FLOAT64, petal_length FLOAT64, petal_width FLOAT64, species STRING) + AS + SELECT 5.1 AS sepal_length, + 3.5 AS sepal_width, + 1.4 AS petal_length, + 0.2 AS petal_width, + 'setosa' AS species + UNION ALL SELECT 4.9,3.0,1.4,0.2,'setosa' + UNION ALL SELECT 4.7,3.2,1.3,0.2,'setosa' + UNION ALL SELECT 4.6,3.1,1.5,0.2,'setosa' + UNION ALL SELECT 5.0,3.6,1.4,0.2,'setosa' + UNION ALL SELECT 5.4,3.9,1.7,0.4,'setosa' + UNION ALL SELECT 4.6,3.4,1.4,0.3,'setosa' + UNION ALL SELECT 5.0,3.4,1.5,0.2,'setosa' + UNION ALL SELECT 4.4,2.9,1.4,0.2,'setosa' + UNION ALL SELECT 4.9,3.1,1.5,0.1,'setosa' + UNION ALL SELECT 5.4,3.7,1.5,0.2,'setosa' + UNION ALL SELECT 4.8,3.4,1.6,0.2,'setosa' + UNION ALL SELECT 4.8,3.0,1.4,0.1,'setosa' + UNION ALL SELECT 
4.3,3.0,1.1,0.1,'setosa' + UNION ALL SELECT 5.8,4.0,1.2,0.2,'setosa' + UNION ALL SELECT 5.7,4.4,1.5,0.4,'setosa' + UNION ALL SELECT 5.4,3.9,1.3,0.4,'setosa' + UNION ALL SELECT 5.1,3.5,1.4,0.3,'setosa' + UNION ALL SELECT 5.7,3.8,1.7,0.3,'setosa' + UNION ALL SELECT 5.1,3.8,1.5,0.3,'setosa' + UNION ALL SELECT 5.4,3.4,1.7,0.2,'setosa' + UNION ALL SELECT 5.1,3.7,1.5,0.4,'setosa' + UNION ALL SELECT 4.6,3.6,1.0,0.2,'setosa' + UNION ALL SELECT 5.1,3.3,1.7,0.5,'setosa' + UNION ALL SELECT 4.8,3.4,1.9,0.2,'setosa' + UNION ALL SELECT 5.0,3.0,1.6,0.2,'setosa' + UNION ALL SELECT 5.0,3.4,1.6,0.4,'setosa' + UNION ALL SELECT 5.2,3.5,1.5,0.2,'setosa' + UNION ALL SELECT 5.2,3.4,1.4,0.2,'setosa' + UNION ALL SELECT 4.7,3.2,1.6,0.2,'setosa' + UNION ALL SELECT 4.8,3.1,1.6,0.2,'setosa' + UNION ALL SELECT 5.4,3.4,1.5,0.4,'setosa' + UNION ALL SELECT 5.2,4.1,1.5,0.1,'setosa' + UNION ALL SELECT 5.5,4.2,1.4,0.2,'setosa' + UNION ALL SELECT 4.9,3.1,1.5,0.1,'setosa' + UNION ALL SELECT 5.0,3.2,1.2,0.2,'setosa' + UNION ALL SELECT 5.5,3.5,1.3,0.2,'setosa' + UNION ALL SELECT 4.9,3.1,1.5,0.1,'setosa' + UNION ALL SELECT 4.4,3.0,1.3,0.2,'setosa' + UNION ALL SELECT 5.1,3.4,1.5,0.2,'setosa' + UNION ALL SELECT 5.0,3.5,1.3,0.3,'setosa' + UNION ALL SELECT 4.5,2.3,1.3,0.3,'setosa' + UNION ALL SELECT 4.4,3.2,1.3,0.2,'setosa' + UNION ALL SELECT 5.0,3.5,1.6,0.6,'setosa' + UNION ALL SELECT 5.1,3.8,1.9,0.4,'setosa' + UNION ALL SELECT 4.8,3.0,1.4,0.3,'setosa' + UNION ALL SELECT 5.1,3.8,1.6,0.2,'setosa' + UNION ALL SELECT 4.6,3.2,1.4,0.2,'setosa' + UNION ALL SELECT 5.3,3.7,1.5,0.2,'setosa' + UNION ALL SELECT 5.0,3.3,1.4,0.2,'setosa' + UNION ALL SELECT 7.0,3.2,4.7,1.4,'versicolor' + UNION ALL SELECT 6.4,3.2,4.5,1.5,'versicolor' + UNION ALL SELECT 6.9,3.1,4.9,1.5,'versicolor' + UNION ALL SELECT 5.5,2.3,4.0,1.3,'versicolor' + UNION ALL SELECT 6.5,2.8,4.6,1.5,'versicolor' + UNION ALL SELECT 5.7,2.8,4.5,1.3,'versicolor' + UNION ALL SELECT 6.3,3.3,4.7,1.6,'versicolor' + UNION ALL SELECT 4.9,2.4,3.3,1.0,'versicolor' + 
UNION ALL SELECT 6.6,2.9,4.6,1.3,'versicolor' + UNION ALL SELECT 5.2,2.7,3.9,1.4,'versicolor' + UNION ALL SELECT 5.0,2.0,3.5,1.0,'versicolor' + UNION ALL SELECT 5.9,3.0,4.2,1.5,'versicolor' + UNION ALL SELECT 6.0,2.2,4.0,1.0,'versicolor' + UNION ALL SELECT 6.1,2.9,4.7,1.4,'versicolor' + UNION ALL SELECT 5.6,2.9,3.6,1.3,'versicolor' + UNION ALL SELECT 6.7,3.1,4.4,1.4,'versicolor' + UNION ALL SELECT 5.6,3.0,4.5,1.5,'versicolor' + UNION ALL SELECT 5.8,2.7,4.1,1.0,'versicolor' + UNION ALL SELECT 6.2,2.2,4.5,1.5,'versicolor' + UNION ALL SELECT 5.6,2.5,3.9,1.1,'versicolor' + UNION ALL SELECT 5.9,3.2,4.8,1.8,'versicolor' + UNION ALL SELECT 6.1,2.8,4.0,1.3,'versicolor' + UNION ALL SELECT 6.3,2.5,4.9,1.5,'versicolor' + UNION ALL SELECT 6.1,2.8,4.7,1.2,'versicolor' + UNION ALL SELECT 6.4,2.9,4.3,1.3,'versicolor' + UNION ALL SELECT 6.6,3.0,4.4,1.4,'versicolor' + UNION ALL SELECT 6.8,2.8,4.8,1.4,'versicolor' + UNION ALL SELECT 6.7,3.0,5.0,1.7,'versicolor' + UNION ALL SELECT 6.0,2.9,4.5,1.5,'versicolor' + UNION ALL SELECT 5.7,2.6,3.5,1.0,'versicolor' + UNION ALL SELECT 5.5,2.4,3.8,1.1,'versicolor' + UNION ALL SELECT 5.5,2.4,3.7,1.0,'versicolor' + UNION ALL SELECT 5.8,2.7,3.9,1.2,'versicolor' + UNION ALL SELECT 6.0,2.7,5.1,1.6,'versicolor' + UNION ALL SELECT 5.4,3.0,4.5,1.5,'versicolor' + UNION ALL SELECT 6.0,3.4,4.5,1.6,'versicolor' + UNION ALL SELECT 6.7,3.1,4.7,1.5,'versicolor' + UNION ALL SELECT 6.3,2.3,4.4,1.3,'versicolor' + UNION ALL SELECT 5.6,3.0,4.1,1.3,'versicolor' + UNION ALL SELECT 5.5,2.5,4.0,1.3,'versicolor' + UNION ALL SELECT 5.5,2.6,4.4,1.2,'versicolor' + UNION ALL SELECT 6.1,3.0,4.6,1.4,'versicolor' + UNION ALL SELECT 5.8,2.6,4.0,1.2,'versicolor' + UNION ALL SELECT 5.0,2.3,3.3,1.0,'versicolor' + UNION ALL SELECT 5.6,2.7,4.2,1.3,'versicolor' + UNION ALL SELECT 5.7,3.0,4.2,1.2,'versicolor' + UNION ALL SELECT 5.7,2.9,4.2,1.3,'versicolor' + UNION ALL SELECT 6.2,2.9,4.3,1.3,'versicolor' + UNION ALL SELECT 5.1,2.5,3.0,1.1,'versicolor' + UNION ALL SELECT 
5.7,2.8,4.1,1.3,'versicolor' + UNION ALL SELECT 6.3,3.3,6.0,2.5,'virginica' + UNION ALL SELECT 5.8,2.7,5.1,1.9,'virginica' + UNION ALL SELECT 7.1,3.0,5.9,2.1,'virginica' + UNION ALL SELECT 6.3,2.9,5.6,1.8,'virginica' + UNION ALL SELECT 6.5,3.0,5.8,2.2,'virginica' + UNION ALL SELECT 7.6,3.0,6.6,2.1,'virginica' + UNION ALL SELECT 4.9,2.5,4.5,1.7,'virginica' + UNION ALL SELECT 7.3,2.9,6.3,1.8,'virginica' + UNION ALL SELECT 6.7,2.5,5.8,1.8,'virginica' + UNION ALL SELECT 7.2,3.6,6.1,2.5,'virginica' + UNION ALL SELECT 6.5,3.2,5.1,2.0,'virginica' + UNION ALL SELECT 6.4,2.7,5.3,1.9,'virginica' + UNION ALL SELECT 6.8,3.0,5.5,2.1,'virginica' + UNION ALL SELECT 5.7,2.5,5.0,2.0,'virginica' + UNION ALL SELECT 5.8,2.8,5.1,2.4,'virginica' + UNION ALL SELECT 6.4,3.2,5.3,2.3,'virginica' + UNION ALL SELECT 6.5,3.0,5.5,1.8,'virginica' + UNION ALL SELECT 7.7,3.8,6.7,2.2,'virginica' + UNION ALL SELECT 7.7,2.6,6.9,2.3,'virginica' + UNION ALL SELECT 6.0,2.2,5.0,1.5,'virginica' + UNION ALL SELECT 6.9,3.2,5.7,2.3,'virginica' + UNION ALL SELECT 5.6,2.8,4.9,2.0,'virginica' + UNION ALL SELECT 7.7,2.8,6.7,2.0,'virginica' + UNION ALL SELECT 6.3,2.7,4.9,1.8,'virginica' + UNION ALL SELECT 6.7,3.3,5.7,2.1,'virginica' + UNION ALL SELECT 7.2,3.2,6.0,1.8,'virginica' + UNION ALL SELECT 6.2,2.8,4.8,1.8,'virginica' + UNION ALL SELECT 6.1,3.0,4.9,1.8,'virginica' + UNION ALL SELECT 6.4,2.8,5.6,2.1,'virginica' + UNION ALL SELECT 7.2,3.0,5.8,1.6,'virginica' + UNION ALL SELECT 7.4,2.8,6.1,1.9,'virginica' + UNION ALL SELECT 7.9,3.8,6.4,2.0,'virginica' + UNION ALL SELECT 6.4,2.8,5.6,2.2,'virginica' + UNION ALL SELECT 6.3,2.8,5.1,1.5,'virginica' + UNION ALL SELECT 6.1,2.6,5.6,1.4,'virginica' + UNION ALL SELECT 7.7,3.0,6.1,2.3,'virginica' + UNION ALL SELECT 6.3,3.4,5.6,2.4,'virginica' + UNION ALL SELECT 6.4,3.1,5.5,1.8,'virginica' + UNION ALL SELECT 6.0,3.0,4.8,1.8,'virginica' + UNION ALL SELECT 6.9,3.1,5.4,2.1,'virginica' + UNION ALL SELECT 6.7,3.1,5.6,2.4,'virginica' + UNION ALL SELECT 
6.9,3.1,5.1,2.3,'virginica' + UNION ALL SELECT 5.8,2.7,5.1,1.9,'virginica' + UNION ALL SELECT 6.8,3.2,5.9,2.3,'virginica' + UNION ALL SELECT 6.7,3.3,5.7,2.5,'virginica' + UNION ALL SELECT 6.7,3.0,5.2,2.3,'virginica' + UNION ALL SELECT 6.3,2.5,5.0,1.9,'virginica' + UNION ALL SELECT 6.5,3.0,5.2,2.0,'virginica' + UNION ALL SELECT 6.2,3.4,5.4,2.3,'virginica' + UNION ALL SELECT 5.9,3.0,5.1,1.8,'virginica'; + + + CALL bqutil.procedure.linear_regression('iris', 'sepal_width', 'petal_width'); +-- TODO: print assertion that result is what we expect +END; \ No newline at end of file From bd9c943a352829b94cebd7558462f4a71710f027 Mon Sep 17 00:00:00 2001 From: Ian Mathews Date: Wed, 18 Aug 2021 16:58:40 -0700 Subject: [PATCH 050/104] implement assertions in unit tests, finalize argument signature --- stored_procedures/chi_square.sql | 72 ++++++++++++------------- stored_procedures/linear_regression.sql | 13 +++-- 2 files changed, 42 insertions(+), 43 deletions(-) diff --git a/stored_procedures/chi_square.sql b/stored_procedures/chi_square.sql index 19fc3fbe8..40ec06a58 100644 --- a/stored_procedures/chi_square.sql +++ b/stored_procedures/chi_square.sql @@ -1,11 +1,9 @@ -- @param STRING table_name table (or subquery) that contains the data -- @param STRING independent_var name of the column in our table that represents our independent variable -- @param STRING dependent_var name of the column in our table that represents our dependent variable --- TODO: return struct rather than record (?) 
--- @return RECORD +-- @return STRUCT x is the chi-square statistic, dof is degrees of freedom, p is the pvalue --- TODO: Use group by for scalability/performance; break into several statements -CREATE OR REPLACE PROCEDURE bqutil.procedure.chi_square (table_name STRING, independent_var STRING, dependent_var STRING ) +CREATE OR REPLACE PROCEDURE bqutil.procedure.chi_square (table_name STRING, independent_var STRING, dependent_var STRING, OUT result STRUCT ) BEGIN EXECUTE IMMEDIATE """ WITH contingency_table AS ( @@ -24,47 +22,43 @@ EXECUTE IMMEDIATE """ dependent_var, independent_total * dependent_total / total as count FROM `contingency_table` + ), + output AS ( + SELECT + SUM(POW(contingency_table.count - expected_table.count, 2) / expected_table.count) as x, + (COUNT(DISTINCT contingency_table.independent_var) - 1) + * (COUNT(DISTINCT contingency_table.dependent_var) - 1) AS dof + FROM contingency_table + INNER JOIN expected_table + ON expected_table.independent_var = contingency_table.independent_var + AND expected_table.dependent_var = contingency_table.dependent_var ) - SELECT - SUM(POW(contingency_table.count - expected_table.count, 2) / expected_table.count) as chi_square, - (COUNT(DISTINCT contingency_table.independent_var) - 1) - * (COUNT(DISTINCT contingency_table.dependent_var) - 1) AS degrees_freedom - FROM contingency_table - INNER JOIN expected_table - ON expected_table.independent_var = contingency_table.independent_var - AND expected_table.dependent_var = contingency_table.dependent_var -"""; + SELECT STRUCT (x, dof, bqutil.fn.pvalue(x, dof) AS p) FROM output +""" INTO result; END; -- a unit test of chi_square --- TODO: this is pretty slow, we should do one insert with lots of records rather than looping through BEGIN - DECLARE i INT64 DEFAULT 0; - CREATE TEMP TABLE categorical (sex STRING, party STRING); - - WHILE i < 2 DO - INSERT INTO categorical (sex, party) VALUES('male', 'republican'); - SET i = i + 1; - END WHILE; - - SET i = 0; - WHILE i < 1 
DO - INSERT INTO categorical (sex, party) VALUES('male', 'democrat'); - SET i = i + 1; - END WHILE; + DECLARE result STRUCT; - SET i = 0; - WHILE i < 3 DO - INSERT INTO categorical (sex, party) VALUES('female', 'republican'); - SET i = i + 1; - END WHILE; + CREATE TEMP TABLE categorical (animal STRING, toy STRING) AS + SELECT 'dog' AS animal, 'ball' as toy + UNION ALL SELECT 'dog', 'ball' + UNION ALL SELECT 'dog', 'ball' + UNION ALL SELECT 'dog', 'ball' + UNION ALL SELECT 'dog', 'yarn' + UNION ALL SELECT 'dog', 'yarn' + UNION ALL SELECT 'cat', 'ball' + UNION ALL SELECT 'cat', 'yarn' + UNION ALL SELECT 'cat', 'yarn' + UNION ALL SELECT 'cat', 'yarn' + UNION ALL SELECT 'cat', 'yarn' + UNION ALL SELECT 'cat', 'yarn' + UNION ALL SELECT 'cat', 'yarn'; - SET i = 0; - WHILE i < 2 DO - INSERT INTO categorical (sex, party) VALUES('female', 'democrat'); - SET i = i + 1; - END WHILE; + CALL bqutil.procedure.chi_square('categorical', 'animal', 'toy', result); - CALL bqutil.procedure.chi_square('categorical', 'sex', 'party'); --- TODO: print assertion that result is what we expect + ASSERT result.x = 3.7452380952380966; + ASSERT result.dof = 1; + ASSERT result.p = 0.052958181867438725; END; diff --git a/stored_procedures/linear_regression.sql b/stored_procedures/linear_regression.sql index fb1c81214..82de015b3 100644 --- a/stored_procedures/linear_regression.sql +++ b/stored_procedures/linear_regression.sql @@ -3,7 +3,7 @@ -- @param STRING dependent_var name of the column in our table that represents our dependent variable -- @return STRUCT -CREATE OR REPLACE PROCEDURE bqutil.procedure.linear_regression (table_name STRING, independent_var STRING, dependent_var STRING ) +CREATE OR REPLACE PROCEDURE bqutil.procedure.linear_regression (table_name STRING, independent_var STRING, dependent_var STRING, OUT result STRUCT ) BEGIN EXECUTE IMMEDIATE """ WITH results AS ( @@ -31,11 +31,12 @@ EXECUTE IMMEDIATE """ FROM sums ) SELECT STRUCT(a, b, r) FROM results; -"""; +""" INTO result; END; 
-- a unit test of linear_regression BEGIN + DECLARE result STRUCT; CREATE TEMP TABLE iris (sepal_length FLOAT64, sepal_width FLOAT64, petal_length FLOAT64, petal_width FLOAT64, species STRING) AS SELECT 5.1 AS sepal_length, @@ -194,6 +195,10 @@ BEGIN UNION ALL SELECT 5.9,3.0,5.1,1.8,'virginica'; - CALL bqutil.procedure.linear_regression('iris', 'sepal_width', 'petal_width'); --- TODO: print assertion that result is what we expect + CALL bqutil.procedure.linear_regression('iris', 'sepal_width', 'petal_width', result); + + -- We round to 11 decimals here because there appears to be some inconsistency in the function, likely due to floating point errors and the order of aggregation + ASSERT ROUND(result.a, 11) = 3.11519268710; + ASSERT ROUND(result.b, 11) = -0.62754617565; + ASSERT ROUND(result.r, 11) = -0.35654408961; END; \ No newline at end of file From a6a56824fd7979cbe3b5eb1e96b4a87b849dc79a Mon Sep 17 00:00:00 2001 From: Ian Mathews Date: Mon, 27 Sep 2021 17:45:53 -0700 Subject: [PATCH 051/104] feat: implement bh_multiple_tests --- stored_procedures/bh_multiple_tests.sql | 74 +++++++++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 stored_procedures/bh_multiple_tests.sql diff --git a/stored_procedures/bh_multiple_tests.sql b/stored_procedures/bh_multiple_tests.sql new file mode 100644 index 000000000..9d6f4a380 --- /dev/null +++ b/stored_procedures/bh_multiple_tests.sql @@ -0,0 +1,74 @@ +-- Adjust p values using the Benjamini-Hochberg multipletests method, additional details in doi:10.1098/rsta.2009.0127 +-- the implementation can be compared with the python function 'statsmodels.stats.multitest.multipletests' (method='fdr_bh') + +-- @param STRING pvalue_table_name : the name of the table with the p values that need to be adjusted +-- @param STRING pvalue_column_name : the name of the column with p values. 
+-- @param INT Nrows : Number of tests (equal to number of rows of the input table) +CREATE OR REPLACE PROCEDURE bqutil.procedure.bh_multiple_tests (pvalue_table_name STRING, pvalue_column_name STRING, n_rows INT64) +BEGIN + EXECUTE IMMEDIATE format(""" + CREATE TEMP TABLE bh_multiple_tests_results AS + WITH padjusted_data AS ( + WITH ranked_data AS ( + SELECT *, ( DENSE_RANK() OVER( ORDER BY %s) ) AS jrank + FROM %s + ) + SELECT *, + MIN( %d * %s / jrank ) + OVER ( + ORDER BY jrank DESC + ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW + ) AS p_adj + FROM ranked_data + ) + SELECT * EXCEPT (p_adj, jrank), IF( p_adj > 1.0 , 1.0, p_adj) AS p_adj + FROM padjusted_data + ORDER BY jrank""", pvalue_column_name, pvalue_table_name, n_rows, pvalue_column_name ); +END; + +-- a unit test of bh_multiple_tests +BEGIN + + CREATE TEMP TABLE Pvalues AS + SELECT 0.001 as pval + UNION ALL SELECT 0.008 + UNION ALL SELECT 0.039 + UNION ALL SELECT 0.041 + UNION ALL SELECT 0.042 + UNION ALL SELECT 0.06 + UNION ALL SELECT 0.074 + UNION ALL SELECT 0.205; + + CALL bqutil.procedure.bh_multiple_tests('Pvalues','pval',8); + + # Table Output + # pval p_adj + # 0.001 0.008 + # 0.008 0.032 + # 0.039 0.06720000000000001 + # 0.041 0.06720000000000001 + # 0.042 0.06720000000000001 + # 0.06 0.08 + # 0.074 0.08457142857142856 + # 0.205 0.205 + + ASSERT( + SELECT COUNTIF( + (pval = 0.001 AND p_adj = 0.008) + OR (pval = 0.008 AND p_adj = 0.032) + OR (pval = 0.039 AND p_adj = 0.06720000000000001) + OR (pval = 0.041 AND p_adj = 0.06720000000000001) + OR (pval = 0.042 AND p_adj = 0.06720000000000001) + OR (pval = 0.06 AND p_adj = 0.08) + OR (pval = 0.074 AND p_adj = 0.08457142857142856) + OR (pval = 0.205 AND p_adj = 0.205) + ) + FROM bh_multiple_tests_results + ) = 8; + + + + +END; + + From 2c44d7965ed234429c44c4057894802bd6a111e5 Mon Sep 17 00:00:00 2001 From: Ian Mathews Date: Tue, 28 Sep 2021 10:50:09 -0700 Subject: [PATCH 052/104] feat: pass temp_table_name parameter to bh_multiple_tests --- 
stored_procedures/bh_multiple_tests.sql | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/stored_procedures/bh_multiple_tests.sql b/stored_procedures/bh_multiple_tests.sql index 9d6f4a380..eba354548 100644 --- a/stored_procedures/bh_multiple_tests.sql +++ b/stored_procedures/bh_multiple_tests.sql @@ -4,10 +4,10 @@ -- @param STRING pvalue_table_name : the name of the table with the p values that need to be adjusted -- @param STRING pvalue_column_name : the name of the column with p values. -- @param INT Nrows : Number of tests (equal to number of rows of the input table) -CREATE OR REPLACE PROCEDURE bqutil.procedure.bh_multiple_tests (pvalue_table_name STRING, pvalue_column_name STRING, n_rows INT64) +CREATE OR REPLACE PROCEDURE bqutil.procedure.bh_multiple_tests (pvalue_table_name STRING, pvalue_column_name STRING, n_rows INT64, temp_table_name STRING ) BEGIN EXECUTE IMMEDIATE format(""" - CREATE TEMP TABLE bh_multiple_tests_results AS + CREATE TEMP TABLE %s AS WITH padjusted_data AS ( WITH ranked_data AS ( SELECT *, ( DENSE_RANK() OVER( ORDER BY %s) ) AS jrank @@ -23,7 +23,9 @@ BEGIN ) SELECT * EXCEPT (p_adj, jrank), IF( p_adj > 1.0 , 1.0, p_adj) AS p_adj FROM padjusted_data - ORDER BY jrank""", pvalue_column_name, pvalue_table_name, n_rows, pvalue_column_name ); + ORDER BY jrank""", temp_table_name, pvalue_column_name, pvalue_table_name, n_rows, pvalue_column_name ); + + EXECUTE IMMEDIATE format("""SELECT * FROM %s""", temp_table_name); END; -- a unit test of bh_multiple_tests @@ -39,7 +41,7 @@ BEGIN UNION ALL SELECT 0.074 UNION ALL SELECT 0.205; - CALL bqutil.procedure.bh_multiple_tests('Pvalues','pval',8); + CALL bqutil.procedure.bh_multiple_tests('Pvalues','pval',8, 'bh_multiple_tests_results'); # Table Output # pval p_adj From 3b0e045a5ed6e7c72cb9682d4c90a6a8d5bc1498 Mon Sep 17 00:00:00 2001 From: Boris Aguilar Date: Wed, 6 Oct 2021 20:13:35 -0700 Subject: [PATCH 053/104] Documentation for new procedures --- 
stored_procedures/README.md | 61 +++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/stored_procedures/README.md b/stored_procedures/README.md index 29d86e64b..7e2d8f07c 100644 --- a/stored_procedures/README.md +++ b/stored_procedures/README.md @@ -29,3 +29,64 @@ END; IDs are: [99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109] ``` + +### [chi_square(table_name STRING, independent_var STRING, dependent_var STRING, OUT result STRUCT)](chi_square.sql) +Performs a chi-square statistical test from an input table. It generates a structure containg the chi-square statistics, the degrees of freedom, and the pvalue of the test. +```sql + BEGIN + DECLARE result STRUCT; + + CREATE TEMP TABLE categorical (animal STRING, toy STRING) AS + SELECT 'dog' AS animal, 'ball' as toy + UNION ALL SELECT 'dog', 'ball' + UNION ALL SELECT 'dog', 'ball' + UNION ALL SELECT 'dog', 'ball' + UNION ALL SELECT 'dog', 'yarn' + UNION ALL SELECT 'dog', 'yarn' + UNION ALL SELECT 'cat', 'ball' + UNION ALL SELECT 'cat', 'yarn' + UNION ALL SELECT 'cat', 'yarn' + UNION ALL SELECT 'cat', 'yarn' + UNION ALL SELECT 'cat', 'yarn' + UNION ALL SELECT 'cat', 'yarn' + UNION ALL SELECT 'cat', 'yarn'; + + CALL bqutil.procedure.chi_square('categorical', 'animal', 'toy', result); + SELECT result ; +END +``` +Output: +| result.x | result.dof | result.p | +|---|---|---| +| 3.7452380952380966 | 1.0 | 0.052958181867438725 | + +### [bh_multiple_tests( pvalue_table_name STRING, pvalue_column_name STRING, n_rows INT64, temp_table_name STRING )](bh_multiple_tests.sql) +Adjust p values using the Benjamini-Hochberg multipletests method, additional details in doi:10.1098/rsta.2009.0127 + +```sql +BEGIN + CREATE TEMP TABLE Pvalues AS + SELECT 0.001 as pval + UNION ALL SELECT 0.008 + UNION ALL SELECT 0.039 + UNION ALL SELECT 0.041 + UNION ALL SELECT 0.042 + UNION ALL SELECT 0.06 + UNION ALL SELECT 0.074 + UNION ALL SELECT 0.205; + + CALL bqutil.procedure.bh_multiple_tests('Pvalues','pval',8, 
'bh_multiple_tests_results'); + SELECT * FROM bh_multiple_tests_results; +END; +``` +Output: +| pval | pval_adj | +|---|---| +| 0.008 | 0.032 | +| 0.039 | 0.06720000000000001 | +| 0.041 | 0.06720000000000001 | +| 0.042 | 0.06720000000000001 | +| 0.06 | 0.08 | +| 0.074 | 0.08457142857142856 | +| 0.205 | 0.205 | + From 4be37057169fb7bd7f53f430df2d213adbd52ec9 Mon Sep 17 00:00:00 2001 From: J Ross Thomson <39315853+jrossthomson@users.noreply.github.com> Date: Tue, 12 Oct 2021 13:38:15 -0400 Subject: [PATCH 054/104] Remove commenting of line in run.sh --- udfs/tests/run.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udfs/tests/run.sh b/udfs/tests/run.sh index 2327394c5..6e8e0705e 100755 --- a/udfs/tests/run.sh +++ b/udfs/tests/run.sh @@ -19,4 +19,4 @@ python3 tests/udf_test_utils.py --create-test-datasets python3 -m pytest --workers 100 tests/create_udf_signatures.py "$@" python3 -m pytest --workers 100 tests/test_create_udfs.py "$@" python3 -m pytest --workers 100 tests/test_run_udfs.py "$@" -#python3 tests/udf_test_utils.py --delete-test-datasets +python3 tests/udf_test_utils.py --delete-test-datasets From 402266fc8f38bd4b9cb2e129304139d2a01f285e Mon Sep 17 00:00:00 2001 From: Ross Thomson Date: Tue, 12 Oct 2021 18:25:08 +0000 Subject: [PATCH 055/104] Updated sql files to be sqlx --- udfs/community/chisquare_cdf.sql | 15 -------- udfs/community/chisquare_cdf.sqlx | 26 +++++++++++++ udfs/community/corr_pvalue.sql | 19 ---------- udfs/community/corr_pvalue.sqlx | 37 +++++++++++++++++++ udfs/community/jstat.sql | 14 ------- udfs/community/jstat.sqlx | 32 ++++++++++++++++ udfs/community/normal_cdf.sql | 8 ---- udfs/community/normal_cdf.sqlx | 26 +++++++++++++ .../{p_fisherexact.sql => p_fisherexact.sqlx} | 20 +++++++++- udfs/community/{t_test.sql => t_test.sqlx} | 7 +++- 10 files changed, 145 insertions(+), 59 deletions(-) delete mode 100644 udfs/community/chisquare_cdf.sql create mode 100644 udfs/community/chisquare_cdf.sqlx delete mode 100644 
udfs/community/corr_pvalue.sql create mode 100644 udfs/community/corr_pvalue.sqlx delete mode 100644 udfs/community/jstat.sql create mode 100644 udfs/community/jstat.sqlx delete mode 100644 udfs/community/normal_cdf.sql create mode 100644 udfs/community/normal_cdf.sqlx rename udfs/community/{p_fisherexact.sql => p_fisherexact.sqlx} (94%) rename udfs/community/{t_test.sql => t_test.sqlx} (90%) diff --git a/udfs/community/chisquare_cdf.sql b/udfs/community/chisquare_cdf.sql deleted file mode 100644 index 974ff1044..000000000 --- a/udfs/community/chisquare_cdf.sql +++ /dev/null @@ -1,15 +0,0 @@ -config { hasOutput: true } -#standardSQL - -<<<<<<<< HEAD:udfs/community/chisquare_cdf.sql -CREATE OR REPLACE FUNCTION fn.chisquare_cdf(H FLOAT64, dof INT64) -======== -CREATE OR REPLACE FUNCTION ${self()}(H FLOAT64, dof INT64) ->>>>>>>> upstream/master:udfs/community/pvalue.sqlx -RETURNS FLOAT64 -LANGUAGE js AS """ - return 1.0 - jstat.jStat['chisquare'].cdf(H, dof) -""" -OPTIONS ( - library=["${JS_BUCKET}/jstat-v1.9.4.min.js"] -); diff --git a/udfs/community/chisquare_cdf.sqlx b/udfs/community/chisquare_cdf.sqlx new file mode 100644 index 000000000..5f291a74e --- /dev/null +++ b/udfs/community/chisquare_cdf.sqlx @@ -0,0 +1,26 @@ +config { hasOutput: true } +/* + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#standardSQL + +CREATE OR REPLACE FUNCTION ${self()}(H FLOAT64, dof INT64) +RETURNS FLOAT64 +LANGUAGE js AS """ + return 1.0 - jstat.jStat['chisquare'].cdf(H, dof) +""" +OPTIONS ( + library=["${JS_BUCKET}/jstat-v1.9.4.min.js"] +); diff --git a/udfs/community/corr_pvalue.sql b/udfs/community/corr_pvalue.sql deleted file mode 100644 index 0d28d2602..000000000 --- a/udfs/community/corr_pvalue.sql +++ /dev/null @@ -1,19 +0,0 @@ --- corr_pvalue: --- Input: --- r: correlation value, n : number of samples --- Output: The p value of the correlation -CREATE OR REPLACE FUNCTION fn.corr_pvalue(r FLOAT64, n INT64 ) -RETURNS FLOAT64 -LANGUAGE js AS """ -var abs_r = Math['abs'](r) -if ( abs_r < 1.0 ) { - var t = abs_r * Math['sqrt']( (n-2) / (1.0 - (r*r)) ) - return jstat['jStat']['ttest'](t,n-2,2); - -} else if (abs_r == 1.0 ) { - return 0.0 -} else { - return NaN -} -""" -OPTIONS (library=["${JS_BUCKET}/jstat-v1.9.4.min.js"]); diff --git a/udfs/community/corr_pvalue.sqlx b/udfs/community/corr_pvalue.sqlx new file mode 100644 index 000000000..e6ebc4e0c --- /dev/null +++ b/udfs/community/corr_pvalue.sqlx @@ -0,0 +1,37 @@ +config { hasOutput: true } + +/* + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +-- corr_pvalue: +-- Input: +-- r: correlation value, n : number of samples +-- Output: The p value of the correlation +CREATE OR REPLACE FUNCTION ${self()}(r FLOAT64, n INT64 ) +RETURNS FLOAT64 +LANGUAGE js AS """ +var abs_r = Math['abs'](r) +if ( abs_r < 1.0 ) { + var t = abs_r * Math['sqrt']( (n-2) / (1.0 - (r*r)) ) + return jstat['jStat']['ttest'](t,n-2,2); + +} else if (abs_r == 1.0 ) { + return 0.0 +} else { + return NaN +} +""" +OPTIONS (library=["${JS_BUCKET}/jstat-v1.9.4.min.js"]); diff --git a/udfs/community/jstat.sql b/udfs/community/jstat.sql deleted file mode 100644 index 37f4b1e3f..000000000 --- a/udfs/community/jstat.sql +++ /dev/null @@ -1,14 +0,0 @@ -CREATE OR REPLACE FUNCTION fn.jstat(method STRING, args ARRAY) -RETURNS FLOAT64 -LANGUAGE js AS """ - const methodPath = method['split']('.') - let fn = jstat['jStat'] - for (const name of methodPath){ - fn = fn[name] - } - - return fn(...args) -""" -OPTIONS ( - library=["${JS_BUCKET}/jstat-v1.9.4.min.js"] -); diff --git a/udfs/community/jstat.sqlx b/udfs/community/jstat.sqlx new file mode 100644 index 000000000..9390500a2 --- /dev/null +++ b/udfs/community/jstat.sqlx @@ -0,0 +1,32 @@ +config { hasOutput: true } + +/* + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +CREATE OR REPLACE FUNCTION ${self()}(method STRING, args ARRAY) +RETURNS FLOAT64 +LANGUAGE js AS """ + const methodPath = method['split']('.') + let fn = jstat['jStat'] + for (const name of methodPath){ + fn = fn[name] + } + + return fn(...args) +""" +OPTIONS ( + library=["${JS_BUCKET}/jstat-v1.9.4.min.js"] +); diff --git a/udfs/community/normal_cdf.sql b/udfs/community/normal_cdf.sql deleted file mode 100644 index 2f9f01d68..000000000 --- a/udfs/community/normal_cdf.sql +++ /dev/null @@ -1,8 +0,0 @@ -CREATE OR REPLACE FUNCTION fn.normal_cdf(x FLOAT64, mean FLOAT64, std FLOAT64) -RETURNS FLOAT64 -LANGUAGE js AS """ - return jstat['jStat']['normal']['cdf']( x, mean, std ) -""" -OPTIONS ( - library=["${JS_BUCKET}/jstat-v1.9.4.min.js"] -); diff --git a/udfs/community/normal_cdf.sqlx b/udfs/community/normal_cdf.sqlx new file mode 100644 index 000000000..56371ad8c --- /dev/null +++ b/udfs/community/normal_cdf.sqlx @@ -0,0 +1,26 @@ +config { hasOutput: true } + +/* + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +CREATE OR REPLACE FUNCTION ${self()}(x FLOAT64, mean FLOAT64, std FLOAT64) +RETURNS FLOAT64 +LANGUAGE js AS """ + return jstat['jStat']['normal']['cdf']( x, mean, std ) +""" +OPTIONS ( + library=["${JS_BUCKET}/jstat-v1.9.4.min.js"] +); diff --git a/udfs/community/p_fisherexact.sql b/udfs/community/p_fisherexact.sqlx similarity index 94% rename from udfs/community/p_fisherexact.sql rename to udfs/community/p_fisherexact.sqlx index e9e91b118..c2291bf3d 100644 --- a/udfs/community/p_fisherexact.sql +++ b/udfs/community/p_fisherexact.sqlx @@ -1,6 +1,24 @@ +config { hasOutput: true } + +/* + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + -- Computes the p value of the Fisher exact test -- PARAMETERES: a,b,c,d (values of 2x2 contingency table [[a,b];[c,d]] ) -CREATE OR REPLACE FUNCTION fn.p_fisherexact(a FLOAT64, b FLOAT64, c FLOAT64, d FLOAT64) +CREATE OR REPLACE FUNCTION ${self()}(a FLOAT64, b FLOAT64, c FLOAT64, d FLOAT64) RETURNS FLOAT64 LANGUAGE js AS """ var gl_LnF = [0.000000000000, 0.000000000000,0.693147180560,1.791759469228,3.178053830348,4.787491742782,6.579251212010,8.525161361065,10.604602902745,12.801827480081,15.104412573076,17.502307845874,19.987214495662,22.552163853123,25.191221182739,27.899271383841,30.671860106081,33.505073450137,36.395445208033,39.339884187199,42.335616460753,45.380138898477,48.471181351835,51.606675567764,54.784729398112,58.003605222981,61.261701761002,64.557538627006,67.889743137182,71.257038967168,74.658236348830,78.092223553315,81.557959456115,85.054467017582,88.580827542198,92.136175603687,95.719694542143,99.330612454787,102.968198614514,106.631760260643,110.320639714757,114.034211781462,117.771881399745,121.533081515439,125.317271149357,129.123933639127,132.952575035616,136.802722637326,140.673923648234,144.565743946345,148.477766951773,152.409592584497,156.360836303079,160.331128216631,164.320112263195,168.327445448428,172.352797139163,176.395848406997,180.456291417544,184.533828861450,188.628173423672,192.739047287845,196.866181672890,201.009316399282,205.168199482641,209.342586752537,213.532241494563,217.736934113954,221.956441819130,226.190548323728,230.439043565777,234.701723442818,238.978389561834,243.268849002983,247.572914096187,251.890402209723,256.221135550010,260.564940971863,264.921649798553,269.291097651020,273.673124285694,278.067573440366,282.474292687630,286.893133295427,291.323950094270,295.766601350761,300.220948647014,304.686856765669,309.164193580147,313.652829949879,318.152639620209,322.663499126726,327.185287703775,331.717887196929,336.261181979199,340.815058870799,345.379407062267,349.954118040770,354.539085519441,359.134205369
576,363.739375555564,368.354496072405,372.979468885689,377.614197873919,382.258588773060,386.912549123218,391.575988217330,396.248817051792,400.930948278916,405.622296161145,410.322776526937,415.032306728250,419.750805599545,424.478193418257,429.214391866652,433.959323995015,438.712914186121,443.475088120919,448.245772745385,453.024896238496,457.812387981278,462.608178526875,467.412199571608,472.224383926981,477.044665492586,481.872979229888,486.709261136840,491.553448223298,496.405478487218,501.265290891579,506.132825342035,511.008022665236,515.890824587823,520.781173716044,525.679013515995,530.584288294434,535.496943180170,540.416924105998,545.344177791155,550.278651724286,555.220294146895,560.169054037273,565.124881094874,570.087725725134,575.057539024710,580.034272767131,585.017879388839,590.008311975618,595.005524249382,600.009470555328,605.020105849424,610.037385686239,615.061266207085,620.091704128478,625.128656730891,630.172081847810,635.221937855060,640.278183660408,645.340778693435,650.409682895655,655.484856710889,660.566261075874,665.653857411106,670.747607611913,675.847474039737,680.953419513638,686.065407301994,691.183401114411,696.307365093814,701.437263808737,706.573062245788,711.714725802290,716.862220279104,722.015511873601,727.174567172816,732.339353146739,737.509837141778,742.685986874351,747.867770424643,753.055156230484,758.248113081374,763.446610112640,768.650616799717,773.860102952558,779.075038710167,784.295394535246,789.521141208959,794.752249825813,799.988691788643,805.230438803703,810.477462875864,815.729736303910,820.987231675938,826.249921864843,831.517780023906,836.790779582470,842.068894241700,847.352097970438,852.640365001133,857.933669825857,863.231987192405,868.535292100465,873.843559797866,879.156765776907,884.474885770752,889.797895749890,895.125771918680,900.458490711945,905.796028791646,911.138363043611,916.485470574329,921.837328707805,927.193914982477,932.555207148186,937.921183163208,943.291821191336,948.667099599020,954.046
996952560,959.431492015349,964.820563745166,970.214191291518,975.612353993036,981.015031374908,986.422203146368,991.833849198224,997.249949600428,1002.670484599700,1008.095434617182,1013.524780246136,1018.958502249690,1024.396581558614,1029.838999269135,1035.285736640802,1040.736775094367,1046.192096209725,1051.651681723869,1057.115513528895,1062.583573670030,1068.055844343702,1073.532307895633,1079.012946818975,1084.497743752466,1089.986681478623,1095.479742921963,1100.976911147256,1106.478169357801,1111.983500893733,1117.492889230361,1123.006317976526,1128.523770872991,1134.045231790853,1139.570684729985,1145.100113817497,1150.633503306224,1156.170837573243,1161.712101118401,1167.257278562881,1172.806354647776,1178.359314232698,1183.916142294397,1189.476823925413,1195.041344332735,1200.609688836497,1206.181842868674,1211.757791971821,1217.337521797807,1222.921018106589,1228.508266764989,1234.099253745500,1239.693965125102,1245.292387084100,1250.894505904980,1256.500307971276,1262.109779766461,1267.722907872849,1273.339678970516,1278.960079836233,1284.584097342420,1290.211718456111,1295.842930237932,1301.477719841101,1307.116074510435,1312.757981581373,1318.403428479016,1324.052402717178,1329.704891897446,1335.360883708266,1341.020365924026,1346.683326404162,1352.349753092274,1358.019634015255,1363.692957282426,1369.369711084694,1375.049883693712,1380.733463461050,1386.420438817390,1392.110798271714,1397.804530410517,1403.501623897022,1409.202067470413,1414.905849945069,1420.612960209818,1426.323387227193,1432.037120032702,1437.754147734109,1443.474459510716,1449.198044612669,1454.924892360256,1460.654992143229,1466.388333420127,1472.124905717606,1477.864698629786,1483.607701817595,1489.353905008135,1495.103297994044,1500.855870632869,1506.611612846456,1512.370514620334,1518.132566003114,1523.897757105899,1529.666078101692,1535.437519224822,1541.212070770367,1546.989723093589,1552.770466609382,1558.554291791712,1564.341189173078,1570.131149343976,1575.924162952360,
1581.720220703125,1587.519313357586,1593.321431732963,1599.126566701879,1604.934709191860,1610.745850184837,1616.559980716662,1622.377091876625,1628.197174806977,1634.020220702460,1639.846220809841,1645.675166427451,1651.507048904734,1657.341859641797,1663.179590088963,1669.020231746336,1674.863776163368,1680.710214938425,1686.559539718372,1692.411742198147,1698.266814120349,1704.124747274832,1709.985533498298,1715.849164673896,1721.715632730830,1727.584929643963,1733.457047433439,1739.331978164291,1745.209713946070,1751.090246932471,1756.973569320959,1762.859673352410,1768.748551310742,1774.640195522568,1780.534598356833,1786.431752224470,1792.331649578052,1798.234282911453,1804.139644759508,1810.047727697677,1815.958524341718,1821.872027347356,1827.788229409963,1833.707123264236,1839.628701683880,1845.552957481295,1851.479883507265,1857.409472650655,1863.341717838103,1869.276612033723,1875.214148238805,1881.154319491525,1887.097118866652,1893.042539475259,1898.990574464439,1904.941217017027,1910.894460351315,1916.850297720780,1922.808722413809,1928.769727753433,1934.733307097051,1940.699453836175,1946.668161396160,1952.639423235951,1958.613232847820,1964.589583757118,1970.568469522019,1976.549883733273,1982.533820013961,1988.520272019245,1994.509233436135,2000.500697983243,2006.494659410550,2012.491111499169,2018.490048061115,2024.491462939077,2030.495350006183,2036.501703165785,2042.510516351228,2048.521783525632,2054.535498681675,2060.551655841373,2066.570249055869,2072.591272405219,2078.614719998180,2084.640585972005,2090.668864492236,2096.699549752497,2102.732635974296,2108.768117406820,2114.805988326742,2120.846243038020,2126.888875871703,2132.933881185739,2138.981253364785,2145.030986820017,2151.083075988941,2157.137515335211,2163.194299348439,2169.253422544021,2175.314879462949,2181.378664671636,2187.444772761740,2193.513198349984,2199.583936077986,2205.656980612087,2211.732326643176,2217.809968886525,2223.889902081621,2229.972120991997,2236.056620405072,22
42.143395131985,2248.232440007431,2254.323749889509,2260.417319659554,2266.513144221987,2272.611218504153,2278.711537456173,2284.814096050787,2290.918889283202,2297.025912170944,2303.135159753709,2309.246627093211,2315.360309273044,2321.476201398527,2327.594298596568,2333.714596015519,2339.837088825033,2345.961772215927,2352.088641400042,2358.217691610102,2364.348918099585,2370.482316142582,2376.617881033664,2382.755608087750,2388.895492639976,2395.037530045563,2401.181715679689,2407.328044937358,2413.476513233276,2419.627116001722,2425.779848696426,2431.934706790443,2438.091685776028,2444.250781164521,2450.411988486216,2456.575303290250,2462.740721144482,2468.908237635370,2475.077848367861,2481.249548965272,2487.423335069174,2493.599202339280,2499.777146453331,2505.957163106983,2512.139248013700,2518.323396904638,2524.509605528538,2530.697869651621,2536.888185057474,2543.080547546949,2549.274952938054,2555.471397065849,2561.669875782341,2567.870384956384,2574.072920473571,2580.277478236140,2586.484054162865,2592.692644188961,2598.903244265986,2605.115850361738,2611.330458460160]; diff --git a/udfs/community/t_test.sql b/udfs/community/t_test.sqlx similarity index 90% rename from udfs/community/t_test.sql rename to udfs/community/t_test.sqlx index ed81d7c53..69bd0ca70 100644 --- a/udfs/community/t_test.sql +++ b/udfs/community/t_test.sqlx @@ -1,5 +1,7 @@ +config { hasOutput: true } + /* - * Copyright 2019 Google LLC + * Copyright 2021 Google LLC * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +16,7 @@ * limitations under the License. 
*/ + -- Student's T-Test -- Input: -- pop1, array of FLOAT64, values from first group @@ -22,7 +25,7 @@ -- Struct of t_value and DOF -CREATE OR REPLACE FUNCTION fn.t_test(pop1 ARRAY, pop2 ARRAY) AS (( +CREATE OR REPLACE FUNCTION ${self()}(pop1 ARRAY, pop2 ARRAY) AS (( WITH pop1 AS ( SELECT AVG(p1) x1, STDDEV(p1) st1, COUNT(p1) AS n1 FROM UNNEST(pop1) AS p1 From ec5f181cc48fb999e1cf470c767a1e9d5dd89634 Mon Sep 17 00:00:00 2001 From: Ross Thomson Date: Tue, 12 Oct 2021 18:36:39 +0000 Subject: [PATCH 056/104] Added fix to man --- .../{mannwhitneyu.sql => mannwhitneyu.sqlx} | 20 ++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) rename udfs/community/{mannwhitneyu.sql => mannwhitneyu.sqlx} (67%) diff --git a/udfs/community/mannwhitneyu.sql b/udfs/community/mannwhitneyu.sqlx similarity index 67% rename from udfs/community/mannwhitneyu.sql rename to udfs/community/mannwhitneyu.sqlx index 7c116c2f5..7af4f46b9 100644 --- a/udfs/community/mannwhitneyu.sql +++ b/udfs/community/mannwhitneyu.sqlx @@ -1,7 +1,25 @@ +config { hasOutput: true } + +/* + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + --Computes the U statistics and the p value of the Mann–Whitney U test (also called Mann–Whitney–Wilcoxon) --inputs: x,y (arrays of samples, both should be one-dimensional, type: ARRAY ) -- alt (Defines the alternative hypothesis. 
The following options are available: 'two-sided', 'less', and 'greater' -CREATE OR REPLACE FUNCTION fn.mannwhitneyu(x ARRAY, y ARRAY, alt STRING) +CREATE OR REPLACE FUNCTION ${self()}(x ARRAY, y ARRAY, alt STRING) AS ( ( WITH statistics as ( From ce741b46c4bab615db510eec0cb10b45fce4a396 Mon Sep 17 00:00:00 2001 From: Ross Thomson Date: Tue, 12 Oct 2021 18:45:08 +0000 Subject: [PATCH 057/104] pvalue to chi_square_cdf fixes --- udfs/community/kruskal_wallis.sqlx | 2 +- udfs/community/pvalue.sqlx | 15 --------------- 2 files changed, 1 insertion(+), 16 deletions(-) delete mode 100644 udfs/community/pvalue.sqlx diff --git a/udfs/community/kruskal_wallis.sqlx b/udfs/community/kruskal_wallis.sqlx index 61721f08b..cee3e014c 100644 --- a/udfs/community/kruskal_wallis.sqlx +++ b/udfs/community/kruskal_wallis.sqlx @@ -33,5 +33,5 @@ CREATE OR REPLACE FUNCTION ${self()}(data ARRAY>>>>>>> upstream/master:udfs/community/pvalue.sqlx -RETURNS FLOAT64 -LANGUAGE js AS """ - return 1.0 - jstat.jStat['chisquare'].cdf(H, dof) -""" -OPTIONS ( - library=["${JS_BUCKET}/jstat-v1.9.4.min.js"] -); From e2b898dd421416db3f6d0c2d6a6be67c0faf646d Mon Sep 17 00:00:00 2001 From: Ross Thomson Date: Tue, 12 Oct 2021 18:52:06 +0000 Subject: [PATCH 058/104] fixed test case --- udfs/community/test_cases.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udfs/community/test_cases.js b/udfs/community/test_cases.js index 9d19f483a..fd76ffda4 100644 --- a/udfs/community/test_cases.js +++ b/udfs/community/test_cases.js @@ -881,7 +881,7 @@ generate_udf_test("day_occurrence_of_month", [ // // Below targets StatsLib work // -generate_udf_test("pvalue", [ +generate_udf_test("chisquare_cdf", [ { inputs: [ `CAST(0.3 AS FLOAT64)`, From 55e03e810b7e1d0f5c9b4b982d52919eb341bfe9 Mon Sep 17 00:00:00 2001 From: J Ross Thomson <39315853+jrossthomson@users.noreply.github.com> Date: Tue, 26 Oct 2021 14:20:05 -0400 Subject: [PATCH 059/104] Update udfs/community/t_test.sqlx Co-authored-by: Daniel De Leo 
--- udfs/community/t_test.sqlx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/udfs/community/t_test.sqlx b/udfs/community/t_test.sqlx index 69bd0ca70..f8ba4aa39 100644 --- a/udfs/community/t_test.sqlx +++ b/udfs/community/t_test.sqlx @@ -19,10 +19,10 @@ config { hasOutput: true } -- Student's T-Test -- Input: - -- pop1, array of FLOAT64, values from first group - -- pop2, array of FLOAT64, values from second group +-- pop1, array of FLOAT64, values from first group +-- pop2, array of FLOAT64, values from second group -- Output: - -- Struct of t_value and DOF +-- Struct of t_value and DOF CREATE OR REPLACE FUNCTION ${self()}(pop1 ARRAY, pop2 ARRAY) AS (( From 70d83c80ebf835a374e1a2360fefaa3995ae4c20 Mon Sep 17 00:00:00 2001 From: J Ross Thomson <39315853+jrossthomson@users.noreply.github.com> Date: Tue, 26 Oct 2021 14:20:16 -0400 Subject: [PATCH 060/104] Update udfs/community/t_test.sqlx Co-authored-by: Daniel De Leo --- udfs/community/t_test.sqlx | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/udfs/community/t_test.sqlx b/udfs/community/t_test.sqlx index f8ba4aa39..82060d4d7 100644 --- a/udfs/community/t_test.sqlx +++ b/udfs/community/t_test.sqlx @@ -26,15 +26,22 @@ config { hasOutput: true } CREATE OR REPLACE FUNCTION ${self()}(pop1 ARRAY, pop2 ARRAY) AS (( - WITH pop1 AS ( - SELECT AVG(p1) x1, STDDEV(p1) st1, COUNT(p1) AS n1 FROM UNNEST(pop1) AS p1 - ), - pop2 as ( - SELECT AVG(p2) x2, STDDEV(p2) st2, COUNT(p2) AS n2 FROM UNNEST(pop2) AS p2 + SELECT + AVG(p1) x1, + STDDEV(p1) st1, + COUNT(p1) AS n1 + FROM UNNEST(pop1) AS p1 + ), pop2 as ( + SELECT + AVG(p2) x2, + STDDEV(p2) st2, + COUNT(p2) AS n2 + FROM UNNEST(pop2) AS p2 ) - SELECT - STRUCT(ABS(x1 - x2) / Sqrt((st1 * st1 / n1) + (st2 * st2/ n2)) AS t_value, n1+n2-2 AS dof) - FROM pop1 CROSS JOIN pop2 - -)); \ No newline at end of file + SELECT + STRUCT( + ABS(x1 - x2) / Sqrt((st1 * st1 / n1) + (st2 * st2 / n2)) AS t_value, + n1 + n2 - 2 AS 
dof) + FROM pop1 CROSS JOIN pop2 +)); From 9e3716c549b061d8a857aa162e68c33a5f2dccae Mon Sep 17 00:00:00 2001 From: J Ross Thomson <39315853+jrossthomson@users.noreply.github.com> Date: Tue, 26 Oct 2021 14:20:51 -0400 Subject: [PATCH 061/104] Update udfs/community/p_fisherexact.sqlx Co-authored-by: Daniel De Leo --- udfs/community/p_fisherexact.sqlx | 1 - 1 file changed, 1 deletion(-) diff --git a/udfs/community/p_fisherexact.sqlx b/udfs/community/p_fisherexact.sqlx index c2291bf3d..87438273e 100644 --- a/udfs/community/p_fisherexact.sqlx +++ b/udfs/community/p_fisherexact.sqlx @@ -82,7 +82,6 @@ function LynHyperGoe_appr( a, b, c, d ) { else {temp = temp - lnfact(d);} return( temp ); - } var n = Math['round'](a + b + c + d); From 021a6235353c18fc25e29555da34a62040574da0 Mon Sep 17 00:00:00 2001 From: J Ross Thomson <39315853+jrossthomson@users.noreply.github.com> Date: Tue, 26 Oct 2021 14:20:58 -0400 Subject: [PATCH 062/104] Update udfs/community/p_fisherexact.sqlx Co-authored-by: Daniel De Leo --- udfs/community/p_fisherexact.sqlx | 1 - 1 file changed, 1 deletion(-) diff --git a/udfs/community/p_fisherexact.sqlx b/udfs/community/p_fisherexact.sqlx index 87438273e..d090e6fb4 100644 --- a/udfs/community/p_fisherexact.sqlx +++ b/udfs/community/p_fisherexact.sqlx @@ -125,7 +125,6 @@ function LynHyperGoe_appr( a, b, c, d ) { } if ( LnPrx <= LnPra ) { temp = temp + Math['exp']( LnPrx - LnPra ); - } } } From 6a776527a3374af315479cd3fe0b78ca947105d2 Mon Sep 17 00:00:00 2001 From: J Ross Thomson <39315853+jrossthomson@users.noreply.github.com> Date: Tue, 26 Oct 2021 14:21:15 -0400 Subject: [PATCH 063/104] Update udfs/community/p_fisherexact.sqlx Co-authored-by: Daniel De Leo --- udfs/community/p_fisherexact.sqlx | 1 - 1 file changed, 1 deletion(-) diff --git a/udfs/community/p_fisherexact.sqlx b/udfs/community/p_fisherexact.sqlx index d090e6fb4..e487d6ed1 100644 --- a/udfs/community/p_fisherexact.sqlx +++ b/udfs/community/p_fisherexact.sqlx @@ -106,7 +106,6 @@ function 
LynHyperGoe_appr( a, b, c, d ) { LnPrx = LnHyperGeometric( x , n1_ - x, n_1 - x, n_21 +x) ; if ( LnPrx <= LnPra ) { temp = temp + Math['exp']( LnPrx - LnPra ); - } } } else { From 7823f315fab2f0d7c0cb6ab25f0ba23bff46645f Mon Sep 17 00:00:00 2001 From: J Ross Thomson <39315853+jrossthomson@users.noreply.github.com> Date: Tue, 26 Oct 2021 14:21:21 -0400 Subject: [PATCH 064/104] Update udfs/community/p_fisherexact.sqlx Co-authored-by: Daniel De Leo --- udfs/community/p_fisherexact.sqlx | 1 - 1 file changed, 1 deletion(-) diff --git a/udfs/community/p_fisherexact.sqlx b/udfs/community/p_fisherexact.sqlx index e487d6ed1..ca5ea3e58 100644 --- a/udfs/community/p_fisherexact.sqlx +++ b/udfs/community/p_fisherexact.sqlx @@ -117,7 +117,6 @@ function LynHyperGoe_appr( a, b, c, d ) { for (x = min+1; x <= max; x++) { if (! (x % 10 == 0) ) { LnPrx = LnPrx + Math['log']( ((n_1 - x +1)/x)*(n1_ -x +1)/( n_21 + x) ) ; - } else { LnPrx = LynHyperGoe_appr(x, n1_-x, n_1-x, n_21+x); From e713b9103a47a3a0286932a4da2a556441bf8636 Mon Sep 17 00:00:00 2001 From: J Ross Thomson <39315853+jrossthomson@users.noreply.github.com> Date: Tue, 26 Oct 2021 14:21:27 -0400 Subject: [PATCH 065/104] Update udfs/community/p_fisherexact.sqlx Co-authored-by: Daniel De Leo --- udfs/community/p_fisherexact.sqlx | 1 - 1 file changed, 1 deletion(-) diff --git a/udfs/community/p_fisherexact.sqlx b/udfs/community/p_fisherexact.sqlx index ca5ea3e58..e45245236 100644 --- a/udfs/community/p_fisherexact.sqlx +++ b/udfs/community/p_fisherexact.sqlx @@ -129,5 +129,4 @@ function LynHyperGoe_appr( a, b, c, d ) { var LnPFET = LnPra + Math['log']( temp ); return Math['exp']( LnPFET ) ; - """; From 093a50a0515043a1693752d9b123c564a109d352 Mon Sep 17 00:00:00 2001 From: J Ross Thomson <39315853+jrossthomson@users.noreply.github.com> Date: Tue, 26 Oct 2021 14:22:25 -0400 Subject: [PATCH 066/104] Delete cloudbuild_js_libs.yaml --- udfs/cloudbuild_js_libs.yaml | 66 ------------------------------------ 1 file changed, 66 
deletions(-) delete mode 100644 udfs/cloudbuild_js_libs.yaml diff --git a/udfs/cloudbuild_js_libs.yaml b/udfs/cloudbuild_js_libs.yaml deleted file mode 100644 index 55ce5b8bf..000000000 --- a/udfs/cloudbuild_js_libs.yaml +++ /dev/null @@ -1,66 +0,0 @@ -# Builds js_libs to your own GCS bucket for testing -# gcloud builds submit . --config=build_js_libs.yaml --substitutions=_JS_BUCKET="gs://YOUR_GS_BUCKET[/optional_sub_path]" -# - -steps: - ############################################################ - # Dynamically create the package.json file based off the libs - # specified in the js_libs/js_libs.yaml file. - ############################################################ -- name: gcr.io/bqutil/bq_udf_ci - id: generate_js_libs_package_json - entrypoint: python3 - args: - - tests/udf_test_utils.py - - --generate-js-libs-package-json - ########################################################### - # Install npm packages based off the package.json file - # created in the previous step. - ########################################################### -- name: node - id: install_npm_packages - entrypoint: npm - args: - - install - ############################################################ - # Dynamically create webpack config files needed by webpack - # to build npm packages into single .js files which will be - # hosted on GCS and used by BigQuery UDFs. 
- ############################################################ -- name: gcr.io/bqutil/bq_udf_ci - id: generate_webpack_configs - entrypoint: python3 - args: - - tests/udf_test_utils.py - - --generate-webpack-configs - waitFor: - - install_npm_packages - - generate_js_libs_package_json - ########################################################### - # Build (via webpack) all js libraries for BigQuery UDFs - ########################################################### -- name: node - id: build_bq_js_libs - entrypoint: npm - args: - - run-script - - build-all-libs - waitFor: - - generate_webpack_configs - - install_npm_packages - ########################################################### - # Copy all libs to GCS bucket - ########################################################### -- name: gcr.io/google.com/cloudsdktool/cloud-sdk - id: copy_js_to_gcs - entrypoint: gsutil - args: - - '-m' - - cp - - js_builds/* - - ${_JS_BUCKET} - waitFor: - - build_bq_js_libs -timeout: 1800s # 30 minutes -options: - machineType: N1_HIGHCPU_32 From defd0864e177ce3e4d97c81aeb6b89d56a2ada6e Mon Sep 17 00:00:00 2001 From: Ross Thomson Date: Thu, 28 Oct 2021 13:20:43 +0000 Subject: [PATCH 067/104] Added MannWhitney and Fisher tests. 
--- udfs/community/test_cases.js | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/udfs/community/test_cases.js b/udfs/community/test_cases.js index bf687d4b0..abf82643b 100644 --- a/udfs/community/test_cases.js +++ b/udfs/community/test_cases.js @@ -902,6 +902,19 @@ generate_udf_test("linear_regression", [ expected_output: `STRUCT(CAST(-0.4353361094588436 AS FLOAT64) AS a, CAST( 0.5300416418798544 AS FLOAT64) AS b, CAST(0.632366563565354 AS FLOAT64) AS r)` }, ]); +generate_udf_test("p_fisherexact", [ + { + inputs: [`(SELECT CAST(90 AS FLOAT64), CAST(27 AS FLOAT64), CAST(17 AS FLOAT64), CAST(50 AS FLOAT64))`], + expected_output: `CAST(8.046828829103659E-12 AS FLOAT64)` + }, +]); +generate_udf_test("mannwhitneyu", [ + { + inputs: [`(SELECT ARRAY[2, 4, 6, 2, 3, 7, 5, 1.], ARRAY[8, 10, 11, 14, 20, 18, 19, 9.], CAST('two-sided' AS STRING))`], + + expected_output: `STRUCT(CAST(64.0 AS FLOAT64) AS U, CAST(9.391056991171487E-4 AS FLOAT64) AS p)` + }, +]); // // End of StatsLib work tests // From ab7a8f31b872d8d559937e72ede37c6c556473d2 Mon Sep 17 00:00:00 2001 From: Boris Aguilar Date: Mon, 8 Nov 2021 21:35:45 -0800 Subject: [PATCH 068/104] Update README.md --- udfs/community/README.md | 98 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 92 insertions(+), 6 deletions(-) diff --git a/udfs/community/README.md b/udfs/community/README.md index 725fe3caf..40397bb00 100644 --- a/udfs/community/README.md +++ b/udfs/community/README.md @@ -14,6 +14,7 @@ SELECT bqutil.fn.int(1.684) ## UDFs * [chisquare_cdf](#chisquare_cdfh-float64-dof-float64) +* [corr_pvalue](#corr_pvaluer-float64-n-int64) * [csv_to_struct](#csv_to_structstrlist-string) * [day_occurrence_of_month](#day_occurrence_of_monthdate_expression-any-type) * [degrees](#degreesx-any-type) @@ -34,6 +35,7 @@ SELECT bqutil.fn.int(1.684) * [levenshtein](#levenshteinsource-string-target-string-returns-int64) * 
[linear_interpolate](#linear_interpolatepos-int64-prev-structx-int64-y-float64-next-structx-int64-y-float64) * [linear_regression](#linear_regressionarraystructstructx-float64-y-float64) +* mannwhitneyu * [median](#medianarr-any-type) * [nlp_compromise_number](#nlp_compromise_numberstr-string) * [nlp_compromise_people](#nlp_compromise_peoplestr-string) @@ -41,6 +43,7 @@ SELECT bqutil.fn.int(1.684) * [percentage_difference](#percentage_differenceval1-float64-val2-float64) * [pi](#pi) * [pvalue](#pvalueh-float64-dof-float64) +* p_fisherexact * [radians](#radiansx-any-type) * [random_int](#random_intmin-any-type-max-any-type) * [random_string](#random_stringlength-int64) @@ -813,11 +816,47 @@ SELECT bqutil.fn.int(1.684) ``` ## UDFs - +* [corr_pvalue](#corr_pvaluer-float64-n-int64) * [kruskal_wallis](#kruskal_wallisarrstructfactor-string-val-float64) +* linear_regression +* pvalue +* p_fisherexact +* mannwhitneyu +* t_test ## Documentation +### [corr_pvalue(r FLOAT64, n INT64)](corr_pvalue.sqlx) +Returns the p value of the computed correlation coefficient based on the t-distribution. +Input: +r: correlation value. +n: number of samples. +Output: +The p value of the correlation coefficient. 
+```sql +WITH test_cases AS ( + SELECT 0.9 AS r, 25 n + UNION ALL + SELECT -0.5, 40 + UNION ALL + SELECT 1.0, 50 + UNION ALL + SELECT -1.0, 50 +) +SELECT bqutil.fn.corr_pvalue(r,n) AS p +FROM test_cases +``` + +results: + +| p | +|-----| +| 1.443229117741041E-9 | +| 0.0010423414457657223 | +| 0.0 | +| 0.0 | +----- + ### [kruskal_wallis(ARRAY(STRUCT(factor STRING, val FLOAT64))](kruskal_wallis.sqlx) Takes an array of struct where each struct (point) represents a measurement, with a group label and a measurement value @@ -852,8 +891,7 @@ results: | results.H | results.p | results.DoF | |-----------|-----------|-------------| | 3.4230769 | 0.1805877 | 2 | - - +----- ### [linear_regression(ARRAY(STRUCT(STRUCT(X FLOAT64, Y FLOAT64))](linear_regression.sqlx) Takes an array of STRUCT X, Y and returns _a, b, r_ where _Y = a*X + b_, and _r_ is the "goodness of fit measure. @@ -875,9 +913,7 @@ results: | results.a | results.b | results.r | |---------------------|--------------------|-------------------| | -0.4353361094588436 | 0.5300416418798544 | 0.632366563565354 | - - - +----- ### [pvalue(H FLOAT64, dof FLOAT64)](pvalue.sqlx) Takes _H_ and _dof_ and returns _p_ probability value. @@ -897,6 +933,56 @@ results: | results | |-------------------| |0.8607079764250578 | +----- + +### [p_fisherexact(a FLOAT64, b FLOAT64, c FLOAT64, d FLOAT64)](p_fisherexact.sqlx) +Computes the p value of the Fisher exact test (https://en.wikipedia.org/wiki/Fisher%27s_exact_test), implemented in JavaScript. + +- **Input:** a,b,c,d : values of 2x2 contingency table ([ [ a, b ] ;[ c , d ] ] (type FLOAT64). 
+- **Output:** The p value of the test (type: FLOAT64) + +Example +```SQL +WITH mydata as ( +SELECT + 90.0 as a, + 27.0 as b, + 17.0 as c, + 50.0 as d +) +SELECT + `bqutils.fn.p_fisherexact`(a,b,c,d) as pvalue +FROM + mydata +``` + +Output: +| pvalue | +|---| +| 8.046828829103659E-12 | +----- + +### [mannwhitneyu(x ARRAY, y ARRAY, alt STRING)](mannwhitneyu.sqlx) +Computes the U statistic and the p value of the Mann–Whitney U test (https://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U_test). This test is also called the Mann–Whitney–Wilcoxon (MWW), Wilcoxon rank-sum test, or Wilcoxon–Mann–Whitney test. + +- **Input:** x, y: arrays of samples, both should be one-dimensional (type: `ARRAY<FLOAT64>`); alt: defines the alternative hypothesis, the following options are available: 'two-sided', 'less', and 'greater'. +- **Output:** structure of the type `STRUCT<U FLOAT64, p FLOAT64>`, where U is the statistic and p is the p value of the test. + +Example +``` +WITH mydata AS ( + SELECT + [2, 4, 6, 2, 3, 7, 5, 1.] AS x, + [8, 10, 11, 14, 20, 18, 19, 9. 
] AS y +) +SELECT `bqutils.fn.mannwhitneyu`(y, x, 'two-sided') AS test +FROM mydata +``` + +Output: +| test.U | test.p | +|---|---| +| 0.0 | 9.391056991171487E-4 | ----- ### [t_test(ARRAY,ARRAY)](t_test.sql) From 9d0268eea686ab78da57377adeaf2cfe0df79aac Mon Sep 17 00:00:00 2001 From: Boris Aguilar Date: Mon, 8 Nov 2021 21:45:41 -0800 Subject: [PATCH 069/104] Update README.md --- udfs/community/README.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/udfs/community/README.md b/udfs/community/README.md index 40397bb00..475ea3fca 100644 --- a/udfs/community/README.md +++ b/udfs/community/README.md @@ -35,7 +35,7 @@ SELECT bqutil.fn.int(1.684) * [levenshtein](#levenshteinsource-string-target-string-returns-int64) * [linear_interpolate](#linear_interpolatepos-int64-prev-structx-int64-y-float64-next-structx-int64-y-float64) * [linear_regression](#linear_regressionarraystructstructx-float64-y-float64) -* mannwhitneyu +* [mannwhitneyu](#mannwhitneyux-array-y-array-alt-string) * [median](#medianarr-any-type) * [nlp_compromise_number](#nlp_compromise_numberstr-string) * [nlp_compromise_people](#nlp_compromise_peoplestr-string) @@ -43,7 +43,7 @@ SELECT bqutil.fn.int(1.684) * [percentage_difference](#percentage_differenceval1-float64-val2-float64) * [pi](#pi) * [pvalue](#pvalueh-float64-dof-float64) -* p_fisherexact +* [p_fisherexact](#p_fisherexacta-float64-b-float64-c-float64-d-float64) * [radians](#radiansx-any-type) * [random_int](#random_intmin-any-type-max-any-type) * [random_string](#random_stringlength-int64) @@ -817,12 +817,12 @@ SELECT bqutil.fn.int(1.684) ## UDFs * [corr_pvalue](#corr_pvaluer-float64-n-int64) -* [kruskal_wallis](#kruskal_wallisarrstructfactor-string-val-float64) -* linear_regression -* pvalue -* p_fisherexact -* mannwhitneyu -* t_test +* [kruskal_wallis](#kruskal_wallisarraystructfactor-string-val-float64) +* [linear_regression](#linear_regressionarraystructstructx-float64-y-float64) +* 
[pvalue](#pvalueh-float64-dof-float64) +* [p_fisherexact](#p_fisherexacta-float64-b-float64-c-float64-d-float64) +* [mannwhitneyu](#mannwhitneyux-array-y-array-alt-string) +* [t_test](t_testarrayarray) ## Documentation From 9babfd0843b41c7253ba09e17bd6a0c707602da6 Mon Sep 17 00:00:00 2001 From: Boris Aguilar Date: Mon, 8 Nov 2021 21:48:49 -0800 Subject: [PATCH 070/104] Update README.md --- udfs/community/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udfs/community/README.md b/udfs/community/README.md index 475ea3fca..b46cb0099 100644 --- a/udfs/community/README.md +++ b/udfs/community/README.md @@ -822,7 +822,7 @@ SELECT bqutil.fn.int(1.684) * [pvalue](#pvalueh-float64-dof-float64) * [p_fisherexact](#p_fisherexacta-float64-b-float64-c-float64-d-float64) * [mannwhitneyu](#mannwhitneyux-array-y-array-alt-string) -* [t_test](t_testarrayarray) +* [t_test](#t_testarrayarray) ## Documentation From 3fdc93b7568a766ea7c2e4fc3f7ed8e0267a9a45 Mon Sep 17 00:00:00 2001 From: Boris Aguilar Date: Mon, 8 Nov 2021 22:01:38 -0800 Subject: [PATCH 071/104] test of statistical udfs --- udfs/community/test_cases.js | 39 ++++++++++++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 4 deletions(-) diff --git a/udfs/community/test_cases.js b/udfs/community/test_cases.js index abf82643b..902bd1313 100644 --- a/udfs/community/test_cases.js +++ b/udfs/community/test_cases.js @@ -902,17 +902,48 @@ generate_udf_test("linear_regression", [ expected_output: `STRUCT(CAST(-0.4353361094588436 AS FLOAT64) AS a, CAST( 0.5300416418798544 AS FLOAT64) AS b, CAST(0.632366563565354 AS FLOAT64) AS r)` }, ]); +generate_udf_test("corr_pvalue", [ + { + inputs: [ + `CAST(0.9 AS FLOAT64)`, + `CAST(25 AS INT64)` + ], + expected_output: `CAST(1.443229117741041E-9 AS FLOAT64)` + }, + { + inputs: [ + `CAST(-0.5 AS FLOAT64)`, + `CAST(40 AS INT64)` + ], + expected_output: `CAST(0.0010423414457657223 AS FLOAT64)` + }, + { + inputs: [ + `CAST(1.0 AS FLOAT64)`, + `CAST(50 AS INT64)` 
+ ], + expected_output: `CAST(0.0 AS FLOAT64)` + }, +]); generate_udf_test("p_fisherexact", [ { - inputs: [`(SELECT CAST(90 AS FLOAT64), CAST(27 AS FLOAT64), CAST(17 AS FLOAT64), CAST(50 AS FLOAT64))`], + inputs: [ + `CAST(90 AS FLOAT64)`, + `CAST(27 AS FLOAT64)`, + `CAST(17 AS FLOAT64)`, + `CAST(50 AS FLOAT64)` + ], expected_output: `CAST(8.046828829103659E-12 AS FLOAT64)` }, ]); generate_udf_test("mannwhitneyu", [ { - inputs: [`(SELECT ARRAY[2, 4, 6, 2, 3, 7, 5, 1.], ARRAY[8, 10, 11, 14, 20, 18, 19, 9.], CAST('two-sided' AS STRING))`], - - expected_output: `STRUCT(CAST(64.0 AS FLOAT64) AS U, CAST(9.391056991171487E-4 AS FLOAT64) AS p)` + inputs: [ + `(SELECT ARRAY[2, 4, 6, 2, 3, 7, 5, 1.])`, + `(SELECT ARRAY[8, 10, 11, 14, 20, 18, 19, 9.])`, + `CAST('two-sided' AS STRING)` + ], + expected_output: `STRUCT(CAST(64.0 AS FLOAT64) AS U, CAST(9.391056991171487E-4 AS FLOAT64) AS p)` }, ]); // From 2d05271ae060b71885071cf7a56c8014f42eff68 Mon Sep 17 00:00:00 2001 From: Boris Aguilar Date: Mon, 8 Nov 2021 22:03:04 -0800 Subject: [PATCH 072/104] updated udf --- udfs/community/mannwhitneyu.sqlx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/udfs/community/mannwhitneyu.sqlx b/udfs/community/mannwhitneyu.sqlx index 7af4f46b9..ececb31c7 100644 --- a/udfs/community/mannwhitneyu.sqlx +++ b/udfs/community/mannwhitneyu.sqlx @@ -52,6 +52,6 @@ AS ( IF( alt='less' OR alt='greater', 1.0, 2.0 ) as factor FROM statistics ) - SELECT struct(U, factor* fn.normal_cdf(z,0.0,1.0) as p ) + SELECT struct(U, factor* ${ref("normal_cdf")}(z,0.0,1.0) as p ) FROM normal_appr -)); \ No newline at end of file +)); From 2ff252c71e63af4d6f915d8533c5fc1ffd437d82 Mon Sep 17 00:00:00 2001 From: J Ross Thomson <39315853+jrossthomson@users.noreply.github.com> Date: Thu, 9 Dec 2021 11:33:08 -0500 Subject: [PATCH 073/104] Update stored_procedures/bh_multiple_tests.sql Co-authored-by: Daniel De Leo --- stored_procedures/bh_multiple_tests.sql | 1 - 1 file changed, 1 deletion(-) diff --git 
a/stored_procedures/bh_multiple_tests.sql b/stored_procedures/bh_multiple_tests.sql index eba354548..cf2a74d28 100644 --- a/stored_procedures/bh_multiple_tests.sql +++ b/stored_procedures/bh_multiple_tests.sql @@ -73,4 +73,3 @@ BEGIN END; - From ecc287641ca9859afafcfd341d3f434ecbe89ecf Mon Sep 17 00:00:00 2001 From: J Ross Thomson <39315853+jrossthomson@users.noreply.github.com> Date: Thu, 9 Dec 2021 11:33:18 -0500 Subject: [PATCH 074/104] Update udfs/community/t_test.sqlx Co-authored-by: Daniel De Leo --- udfs/community/t_test.sqlx | 1 + 1 file changed, 1 insertion(+) diff --git a/udfs/community/t_test.sqlx b/udfs/community/t_test.sqlx index 82060d4d7..d2a89b5e9 100644 --- a/udfs/community/t_test.sqlx +++ b/udfs/community/t_test.sqlx @@ -45,3 +45,4 @@ CREATE OR REPLACE FUNCTION ${self()}(pop1 ARRAY, pop2 ARRAY) A n1 + n2 - 2 AS dof) FROM pop1 CROSS JOIN pop2 )); + From 5e142462d9f118858d24a31639d3d93dcda01518 Mon Sep 17 00:00:00 2001 From: J Ross Thomson <39315853+jrossthomson@users.noreply.github.com> Date: Thu, 9 Dec 2021 11:33:30 -0500 Subject: [PATCH 075/104] Update udfs/community/p_fisherexact.sqlx Co-authored-by: Daniel De Leo --- udfs/community/p_fisherexact.sqlx | 1 + 1 file changed, 1 insertion(+) diff --git a/udfs/community/p_fisherexact.sqlx b/udfs/community/p_fisherexact.sqlx index e45245236..ad811d4cf 100644 --- a/udfs/community/p_fisherexact.sqlx +++ b/udfs/community/p_fisherexact.sqlx @@ -130,3 +130,4 @@ function LynHyperGoe_appr( a, b, c, d ) { var LnPFET = LnPra + Math['log']( temp ); return Math['exp']( LnPFET ) ; """; + From 118ae005572d27b4fd63140e0261d2aea57c4b80 Mon Sep 17 00:00:00 2001 From: J Ross Thomson <39315853+jrossthomson@users.noreply.github.com> Date: Thu, 9 Dec 2021 11:33:40 -0500 Subject: [PATCH 076/104] Update udfs/community/normal_cdf.sqlx Co-authored-by: Daniel De Leo --- udfs/community/normal_cdf.sqlx | 1 + 1 file changed, 1 insertion(+) diff --git a/udfs/community/normal_cdf.sqlx b/udfs/community/normal_cdf.sqlx index 
56371ad8c..c7a671337 100644 --- a/udfs/community/normal_cdf.sqlx +++ b/udfs/community/normal_cdf.sqlx @@ -24,3 +24,4 @@ LANGUAGE js AS """ OPTIONS ( library=["${JS_BUCKET}/jstat-v1.9.4.min.js"] ); + From d3fee52c1dd7f5076c4b2665f02e1f91d4af317a Mon Sep 17 00:00:00 2001 From: J Ross Thomson <39315853+jrossthomson@users.noreply.github.com> Date: Thu, 9 Dec 2021 11:33:48 -0500 Subject: [PATCH 077/104] Update udfs/community/mannwhitneyu.sqlx Co-authored-by: Daniel De Leo --- udfs/community/mannwhitneyu.sqlx | 1 + 1 file changed, 1 insertion(+) diff --git a/udfs/community/mannwhitneyu.sqlx b/udfs/community/mannwhitneyu.sqlx index ececb31c7..3460b2165 100644 --- a/udfs/community/mannwhitneyu.sqlx +++ b/udfs/community/mannwhitneyu.sqlx @@ -55,3 +55,4 @@ AS ( SELECT struct(U, factor* ${ref("normal_cdf")}(z,0.0,1.0) as p ) FROM normal_appr )); + From 7598414e40ae8d6a5feedf6df6924fc9653595b7 Mon Sep 17 00:00:00 2001 From: J Ross Thomson <39315853+jrossthomson@users.noreply.github.com> Date: Thu, 9 Dec 2021 11:33:56 -0500 Subject: [PATCH 078/104] Update udfs/community/kruskal_wallis.sqlx Co-authored-by: Daniel De Leo --- udfs/community/kruskal_wallis.sqlx | 1 + 1 file changed, 1 insertion(+) diff --git a/udfs/community/kruskal_wallis.sqlx b/udfs/community/kruskal_wallis.sqlx index cee3e014c..90156134b 100644 --- a/udfs/community/kruskal_wallis.sqlx +++ b/udfs/community/kruskal_wallis.sqlx @@ -35,3 +35,4 @@ CREATE OR REPLACE FUNCTION ${self()}(data ARRAY Date: Thu, 9 Dec 2021 11:34:11 -0500 Subject: [PATCH 079/104] Update udfs/community/jstat.sqlx Co-authored-by: Daniel De Leo --- udfs/community/jstat.sqlx | 1 + 1 file changed, 1 insertion(+) diff --git a/udfs/community/jstat.sqlx b/udfs/community/jstat.sqlx index 9390500a2..024acee95 100644 --- a/udfs/community/jstat.sqlx +++ b/udfs/community/jstat.sqlx @@ -30,3 +30,4 @@ LANGUAGE js AS """ OPTIONS ( library=["${JS_BUCKET}/jstat-v1.9.4.min.js"] ); + From 73e5c8a67f32871ccb7fe1e07961c460c9c977dd Mon Sep 17 00:00:00 2001 From: J 
Ross Thomson <39315853+jrossthomson@users.noreply.github.com> Date: Thu, 9 Dec 2021 11:34:32 -0500 Subject: [PATCH 080/104] Update udfs/community/corr_pvalue.sqlx Co-authored-by: Daniel De Leo --- udfs/community/corr_pvalue.sqlx | 1 + 1 file changed, 1 insertion(+) diff --git a/udfs/community/corr_pvalue.sqlx b/udfs/community/corr_pvalue.sqlx index e6ebc4e0c..5de8ac5e4 100644 --- a/udfs/community/corr_pvalue.sqlx +++ b/udfs/community/corr_pvalue.sqlx @@ -35,3 +35,4 @@ if ( abs_r < 1.0 ) { } """ OPTIONS (library=["${JS_BUCKET}/jstat-v1.9.4.min.js"]); + From e841e67e70373b8077cbc10b031f48b17c6f714e Mon Sep 17 00:00:00 2001 From: J Ross Thomson <39315853+jrossthomson@users.noreply.github.com> Date: Thu, 9 Dec 2021 11:34:43 -0500 Subject: [PATCH 081/104] Update udfs/community/chisquare_cdf.sqlx Co-authored-by: Daniel De Leo --- udfs/community/chisquare_cdf.sqlx | 1 + 1 file changed, 1 insertion(+) diff --git a/udfs/community/chisquare_cdf.sqlx b/udfs/community/chisquare_cdf.sqlx index 5f291a74e..c23a8115e 100644 --- a/udfs/community/chisquare_cdf.sqlx +++ b/udfs/community/chisquare_cdf.sqlx @@ -24,3 +24,4 @@ LANGUAGE js AS """ OPTIONS ( library=["${JS_BUCKET}/jstat-v1.9.4.min.js"] ); + From 056cd55cd08da1ef590453ca006c1165edea7f61 Mon Sep 17 00:00:00 2001 From: J Ross Thomson <39315853+jrossthomson@users.noreply.github.com> Date: Thu, 9 Dec 2021 11:34:52 -0500 Subject: [PATCH 082/104] Update stored_procedures/linear_regression.sql Co-authored-by: Daniel De Leo --- stored_procedures/linear_regression.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stored_procedures/linear_regression.sql b/stored_procedures/linear_regression.sql index 82de015b3..b0cf4d666 100644 --- a/stored_procedures/linear_regression.sql +++ b/stored_procedures/linear_regression.sql @@ -201,4 +201,4 @@ BEGIN ASSERT ROUND(result.a, 11) = 3.11519268710; ASSERT ROUND(result.b, 11) = -0.62754617565; ASSERT ROUND(result.r, 11) = -0.35654408961; -END; \ No newline at end of file +END; 
From fcce8308da9bfe3d3162453eac09ce6005999c1b Mon Sep 17 00:00:00 2001 From: J Ross Thomson <39315853+jrossthomson@users.noreply.github.com> Date: Thu, 9 Dec 2021 11:35:03 -0500 Subject: [PATCH 083/104] Update stored_procedures/chi_square.sql Co-authored-by: Daniel De Leo --- stored_procedures/chi_square.sql | 1 + 1 file changed, 1 insertion(+) diff --git a/stored_procedures/chi_square.sql b/stored_procedures/chi_square.sql index 40ec06a58..6a6db3cf2 100644 --- a/stored_procedures/chi_square.sql +++ b/stored_procedures/chi_square.sql @@ -62,3 +62,4 @@ BEGIN ASSERT result.dof = 1; ASSERT result.p = 0.052958181867438725; END; + From 72db4f510d769a45e1700531313e655559176899 Mon Sep 17 00:00:00 2001 From: Ross Thomson Date: Thu, 9 Dec 2021 20:38:53 +0000 Subject: [PATCH 084/104] Added license header --- stored_procedures/bh_multiple_tests.sql | 16 ++++++++++++++++ stored_procedures/chi_square.sql | 16 ++++++++++++++++ stored_procedures/linear_regression.sql | 16 ++++++++++++++++ 3 files changed, 48 insertions(+) diff --git a/stored_procedures/bh_multiple_tests.sql b/stored_procedures/bh_multiple_tests.sql index cf2a74d28..d405bd930 100644 --- a/stored_procedures/bh_multiple_tests.sql +++ b/stored_procedures/bh_multiple_tests.sql @@ -1,3 +1,19 @@ +/* + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + -- Adjust p values using the Benjamini-Hochberg multipletests method, additional details in doi:10.1098/rsta.2009.0127 -- the implementation can be compared with the python function 'statsmodels.stats.multitest.multipletests' (method='fdr_bh') diff --git a/stored_procedures/chi_square.sql b/stored_procedures/chi_square.sql index 6a6db3cf2..14371e46c 100644 --- a/stored_procedures/chi_square.sql +++ b/stored_procedures/chi_square.sql @@ -1,3 +1,19 @@ +/* + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + -- @param STRING table_name table (or subquery) that contains the data -- @param STRING independent_var name of the column in our table that represents our independent variable -- @param STRING dependent_var name of the column in our table that represents our dependent variable diff --git a/stored_procedures/linear_regression.sql b/stored_procedures/linear_regression.sql index b0cf4d666..cc156792d 100644 --- a/stored_procedures/linear_regression.sql +++ b/stored_procedures/linear_regression.sql @@ -1,3 +1,19 @@ +/* + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + -- @param STRING table_name table (or subquery) that contains the data -- @param STRING independent_var name of the column in our table that represents our independent variable -- @param STRING dependent_var name of the column in our table that represents our dependent variable From 82e62291bf72312bffbfc3d156b77c80626074a0 Mon Sep 17 00:00:00 2001 From: Ross Thomson Date: Thu, 9 Dec 2021 21:08:11 +0000 Subject: [PATCH 085/104] Added linear regression --- stored_procedures/README.md | 42 +++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/stored_procedures/README.md b/stored_procedures/README.md index 7e2d8f07c..e8f24e020 100644 --- a/stored_procedures/README.md +++ b/stored_procedures/README.md @@ -15,6 +15,9 @@ CALL bqutil.procedure.GetNextIds(10, next_ids); ## Stored Procedures * [GetNextIds](#GetNextIds) +* [chi_square](#chi_square) +* [Benjamini-Hochberg multipletests](#bh_multiple_tests) +* [Linear Regression](#linear_regression) ## Documentation @@ -90,3 +93,42 @@ Output: | 0.074 | 0.08457142857142856 | | 0.205 | 0.205 | +### [linear_regression (table_name STRING, independent_var STRING, dependent_var STRING, OUT result STRUCT )](linear_regression.sql) +Run a standard linear regression on table data. Expects a table and two columns: the independent variable and the dependent variable. The output is a STRUCT with the slope (`a`), the intercept (`b`) and the correlation value (`r`). 
+ +> Input data + +The unit test for this procedure builds a TEMP table to contain the classic [Iris flower data set](https://en.wikipedia.org/wiki/Iris_flower_data_set). This dataset contains 150 data points, not all shown below. The sample call demonstrates how to access the output. + +```sql +-- a unit test of linear_regression +BEGIN + DECLARE result STRUCT; + CREATE TEMP TABLE iris (sepal_length FLOAT64, sepal_width FLOAT64, petal_length FLOAT64, petal_width FLOAT64, species STRING) + AS + SELECT 5.1 AS sepal_length, + 3.5 AS sepal_width, + 1.4 AS petal_length, + 0.2 AS petal_width, + 'setosa' AS species + UNION ALL SELECT 4.9,3.0,1.4,0.2,'setosa' + UNION ALL SELECT 4.7,3.2,1.3,0.2,'setosa' + ... + UNION ALL SELECT 6.5,3.0,5.2,2.0,'virginica' + UNION ALL SELECT 6.2,3.4,5.4,2.3,'virginica' + UNION ALL SELECT 5.9,3.0,5.1,1.8,'virginica'; +``` + +```sql +CALL bqutil.procedure.linear_regression('iris', 'sepal_width', 'petal_width', result); + + -- We round to 11 decimals here because there appears to be some inconsistency in the function, likely due to floating point errors and the order of aggregation + ASSERT ROUND(result.a, 11) = 3.11519268710; + ASSERT ROUND(result.b, 11) = -0.62754617565; + ASSERT ROUND(result.r, 11) = -0.35654408961; +END; +``` + +Output: + +`This assertion was successful` \ No newline at end of file From 5321ad1aeba185e60e34593f7f03a908ae3ca04d Mon Sep 17 00:00:00 2001 From: Ross Thomson Date: Thu, 9 Dec 2021 21:14:19 +0000 Subject: [PATCH 086/104] fixed index links. 
--- stored_procedures/README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/stored_procedures/README.md b/stored_procedures/README.md index e8f24e020..8a8a5cf87 100644 --- a/stored_procedures/README.md +++ b/stored_procedures/README.md @@ -14,10 +14,10 @@ CALL bqutil.procedure.GetNextIds(10, next_ids); ## Stored Procedures -* [GetNextIds](#GetNextIds) -* [chi_square](#chi_square) -* [Benjamini-Hochberg multipletests](#bh_multiple_tests) -* [Linear Regression](#linear_regression) +* [GetNextIds](#getnextidsid_count-int64-out-next_ids-array) +* [chi_square](#chi_squaretable_name-string-independent_var-string-dependent_var-string-out-result-structx-float64-dof-float64-p-float64) +* [Benjamini-Hochberg multipletests](#bh_multiple_tests-pvalue_table_name-string-pvalue_column_name-string-n_rows-int64-temp_table_name-string-) +* [Linear Regression](#linear_regression-table_name-string-independent_var-string-dependent_var-string-out-result-structa-float64-b-float64-r-float64-) ## Documentation From 7eb70f5e616a4a7fa162b52568d9dcdc16fc9460 Mon Sep 17 00:00:00 2001 From: Ross Thomson Date: Fri, 10 Dec 2021 14:26:40 +0000 Subject: [PATCH 087/104] fixed index names --- stored_procedures/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/stored_procedures/README.md b/stored_procedures/README.md index 8a8a5cf87..78a25dea1 100644 --- a/stored_procedures/README.md +++ b/stored_procedures/README.md @@ -16,8 +16,8 @@ CALL bqutil.procedure.GetNextIds(10, next_ids); * [GetNextIds](#getnextidsid_count-int64-out-next_ids-array) * [chi_square](#chi_squaretable_name-string-independent_var-string-dependent_var-string-out-result-structx-float64-dof-float64-p-float64) -* [Benjamini-Hochberg multipletests](#bh_multiple_tests-pvalue_table_name-string-pvalue_column_name-string-n_rows-int64-temp_table_name-string-) -* [Linear 
Regression](#linear_regression-table_name-string-independent_var-string-dependent_var-string-out-result-structa-float64-b-float64-r-float64-) +* [bh_multiple_tests](#bh_multiple_tests-pvalue_table_name-string-pvalue_column_name-string-n_rows-int64-temp_table_name-string-) +* [linear_regression](#linear_regression-table_name-string-independent_var-string-dependent_var-string-out-result-structa-float64-b-float64-r-float64-) ## Documentation From 7ee73331e090166c018e18b08e9e8372694d62ba Mon Sep 17 00:00:00 2001 From: Ross Thomson Date: Tue, 21 Dec 2021 20:01:38 +0000 Subject: [PATCH 088/104] Added Unit tests. --- udfs/community/README.md | 20 +++++++------------- udfs/community/jstat.sqlx | 33 --------------------------------- udfs/community/test_cases.js | 20 ++++++++++++++++++++ 3 files changed, 27 insertions(+), 46 deletions(-) delete mode 100644 udfs/community/jstat.sqlx diff --git a/udfs/community/README.md b/udfs/community/README.md index b46cb0099..94dc2f6f4 100644 --- a/udfs/community/README.md +++ b/udfs/community/README.md @@ -985,25 +985,19 @@ Output: | 0.0 | 9.391056991171487E-4 | ----- -### [t_test(ARRAY,ARRAY)](t_test.sql) +### [normal_cdf(ARRAY,ARRAY)](t_test.sql) -Runs the Student's T-test. Well known test to compare populations. Example taken from here: [Sample](https://www.jmp.com/en_ch/statistics-knowledge-portal/t-test/two-sample-t-test.html) +Returns the value of x in the cdf of the Normal distribution with parameters mean and std (standard deviation). 
Sample Query: ```SQL -DECLARE pop1 ARRAY; -DECLARE pop2 ARRAY; - -SET pop1 = [13.3,6.0,20.0,8.0,14.0,19.0,18.0,25.0,16.0,24.0,15.0,1.0,15.0]; -SET pop2 = [22.0,16.0,21.7,21.0,30.0,26.0,12.0,23.2,28.0,23.0] ; - -SELECT `bqutils.fn.t_test`(pop1, pop2) AS actual_result_rows; - +SELECT `bigquery-utils-stat.fn.normal_cdf`(1.1, 1.7, 2.0) as normal_cdf; ``` Results: -| Row | actual_result_rows.t_value | actual_result_rows.dof| -|-----|----------------------------|-----------------------| -| 1 | 2.8957935572829476 | 21 +| Row | normal_cdf | +|-----|-------------------| +| 1 | 0.3820885778110474 | + \ No newline at end of file diff --git a/udfs/community/jstat.sqlx b/udfs/community/jstat.sqlx deleted file mode 100644 index 024acee95..000000000 --- a/udfs/community/jstat.sqlx +++ /dev/null @@ -1,33 +0,0 @@ -config { hasOutput: true } - -/* - * Copyright 2021 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -CREATE OR REPLACE FUNCTION ${self()}(method STRING, args ARRAY) -RETURNS FLOAT64 -LANGUAGE js AS """ - const methodPath = method['split']('.') - let fn = jstat['jStat'] - for (const name of methodPath){ - fn = fn[name] - } - - return fn(...args) -""" -OPTIONS ( - library=["${JS_BUCKET}/jstat-v1.9.4.min.js"] -); - diff --git a/udfs/community/test_cases.js b/udfs/community/test_cases.js index 902bd1313..01d1f3636 100644 --- a/udfs/community/test_cases.js +++ b/udfs/community/test_cases.js @@ -936,6 +936,15 @@ generate_udf_test("p_fisherexact", [ expected_output: `CAST(8.046828829103659E-12 AS FLOAT64)` }, ]); +generate_udf_test("t_test", [ + { + inputs: [ + `(SELECT ARRAY[13.3,6.0,20.0,8.0,14.0,19.0,18.0,25.0,16.0,24.0,15.0,1.0,15.0])`, + `(SELECT ARRAY[22.0,16.0,21.7,21.0,30.0,26.0,12.0,23.2,28.0,23.0])` + ], + expected_output: `STRUCT(CAST(2.8957935572829476 AS FLOAT64) AS t_value, CAST(21 AS INTEGER) AS dof)` + }, +]); generate_udf_test("mannwhitneyu", [ { inputs: [ @@ -947,6 +956,17 @@ generate_udf_test("mannwhitneyu", [ }, ]); // +generate_udf_test("normal_cdf", [ + { + inputs: [ + `CAST(1.1 AS FLOAT64)`, + `CAST(1.7 AS FLOAT64)`, + `CAST(2.0 AS FLOAT64)` + ], + expected_output: `CAST(0.3820885778110474 AS FLOAT64)` + }, +]); +// // End of StatsLib work tests // generate_udf_test("jaccard", [ From 663ce64777ae09cd5fc3075ac2f64dd2b87d854b Mon Sep 17 00:00:00 2001 From: Ross Thomson Date: Tue, 21 Dec 2021 20:03:43 +0000 Subject: [PATCH 089/104] Fixed Link --- udfs/community/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/udfs/community/README.md b/udfs/community/README.md index 94dc2f6f4..7e732e912 100644 --- a/udfs/community/README.md +++ b/udfs/community/README.md @@ -39,6 +39,7 @@ SELECT bqutil.fn.int(1.684) * [median](#medianarr-any-type) * [nlp_compromise_number](#nlp_compromise_numberstr-string) * [nlp_compromise_people](#nlp_compromise_peoplestr-string) +* [normal_cdf](#normal_cdfarrayarray) * 
[percentage_change](#percentage_changeval1-float64-val2-float64) * [percentage_difference](#percentage_differenceval1-float64-val2-float64) * [pi](#pi) From 24cc1486b76bd252741b515e11c9734314e0467c Mon Sep 17 00:00:00 2001 From: Ross Thomson Date: Tue, 21 Dec 2021 20:04:59 +0000 Subject: [PATCH 090/104] Fixed function name. --- udfs/community/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udfs/community/README.md b/udfs/community/README.md index 7e732e912..4a528385b 100644 --- a/udfs/community/README.md +++ b/udfs/community/README.md @@ -993,7 +993,7 @@ Returns the value of x in the cdf of the Normal distribution with parameters mea Sample Query: ```SQL -SELECT `bigquery-utils-stat.fn.normal_cdf`(1.1, 1.7, 2.0) as normal_cdf; +SELECT `bqutils.fn.normal_cdf`(1.1, 1.7, 2.0) as normal_cdf; ``` Results: From cd5bead0c043ab9da2bc2e8c59e658796a5e8d4b Mon Sep 17 00:00:00 2001 From: Ross Thomson Date: Tue, 21 Dec 2021 20:11:26 +0000 Subject: [PATCH 091/104] Reinstating t-test --- udfs/community/README.md | 53 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 50 insertions(+), 3 deletions(-) diff --git a/udfs/community/README.md b/udfs/community/README.md index 4a528385b..cb07cb2bd 100644 --- a/udfs/community/README.md +++ b/udfs/community/README.md @@ -39,7 +39,6 @@ SELECT bqutil.fn.int(1.684) * [median](#medianarr-any-type) * [nlp_compromise_number](#nlp_compromise_numberstr-string) * [nlp_compromise_people](#nlp_compromise_peoplestr-string) -* [normal_cdf](#normal_cdfarrayarray) * [percentage_change](#percentage_changeval1-float64-val2-float64) * [percentage_difference](#percentage_differenceval1-float64-val2-float64) * [pi](#pi) @@ -986,7 +985,56 @@ Output: | 0.0 | 9.391056991171487E-4 | ----- -### [normal_cdf(ARRAY,ARRAY)](t_test.sql) +### [t_test(ARRAY,ARRAY)](t_test.sqlx) + +Runs the Student's T-test. Well known test to compare populations. 
Example taken from here: [Sample](https://www.jmp.com/en_ch/statistics-knowledge-portal/t-test/two-sample-t-test.html) + +Sample Query: + +```SQL +DECLARE pop1 ARRAY; +DECLARE pop2 ARRAY; + +SET pop1 = [13.3,6.0,20.0,8.0,14.0,19.0,18.0,25.0,16.0,24.0,15.0,1.0,15.0]; +SET pop2 = [22.0,16.0,21.7,21.0,30.0,26.0,12.0,23.2,28.0,23.0] ; + +SELECT `bqutils.fn.t_test`(pop1, pop2) AS actual_result_rows; + +``` + +Results: + +| Row | actual_result_rows.t_value | actual_result_rows.dof| +|-----|----------------------------|-----------------------| +| 1 | 2.8957935572829476 | 21 + +----- +### [t_test(ARRAY,ARRAY)](t_test.sqlx) + +Runs the Student's T-test. Well known test to compare populations. Example taken from here: [Sample](https://www.jmp.com/en_ch/statistics-knowledge-portal/t-test/two-sample-t-test.html) + +Sample Query: + +```SQL +DECLARE pop1 ARRAY; +DECLARE pop2 ARRAY; + +SET pop1 = [13.3,6.0,20.0,8.0,14.0,19.0,18.0,25.0,16.0,24.0,15.0,1.0,15.0]; +SET pop2 = [22.0,16.0,21.7,21.0,30.0,26.0,12.0,23.2,28.0,23.0] ; + +SELECT `bqutils.fn.t_test`(pop1, pop2) AS actual_result_rows; + +``` + +Results: + +| Row | actual_result_rows.t_value | actual_result_rows.dof| +|-----|----------------------------|-----------------------| +| 1 | 2.8957935572829476 | 21 + + +----- +### [normal_cdf(x FLOAT64, mean FLOAT64, stdev FLOAT64)](normal_cdf.sqlx) Returns the value of x in the cdf of the Normal distribution with parameters mean and std (standard deviation). 
@@ -1001,4 +1049,3 @@ Results: | Row | normal_cdf | |-----|-------------------| | 1 | 0.3820885778110474 | - \ No newline at end of file From 18c5eea267096ca573c3837dda171e7ef30c964f Mon Sep 17 00:00:00 2001 From: Ross Thomson Date: Tue, 21 Dec 2021 20:13:08 +0000 Subject: [PATCH 092/104] Fixed link normal_cdf --- udfs/community/README.md | 25 +------------------------ 1 file changed, 1 insertion(+), 24 deletions(-) diff --git a/udfs/community/README.md b/udfs/community/README.md index cb07cb2bd..7101bd666 100644 --- a/udfs/community/README.md +++ b/udfs/community/README.md @@ -39,6 +39,7 @@ SELECT bqutil.fn.int(1.684) * [median](#medianarr-any-type) * [nlp_compromise_number](#nlp_compromise_numberstr-string) * [nlp_compromise_people](#nlp_compromise_peoplestr-string) +* [normal_cdf](#normal_cdfx-float64-mean-float64-stdev-float64) * [percentage_change](#percentage_changeval1-float64-val2-float64) * [percentage_difference](#percentage_differenceval1-float64-val2-float64) * [pi](#pi) @@ -1008,30 +1009,6 @@ Results: |-----|----------------------------|-----------------------| | 1 | 2.8957935572829476 | 21 ------ -### [t_test(ARRAY,ARRAY)](t_test.sqlx) - -Runs the Student's T-test. Well known test to compare populations. 
Example taken from here: [Sample](https://www.jmp.com/en_ch/statistics-knowledge-portal/t-test/two-sample-t-test.html) - -Sample Query: - -```SQL -DECLARE pop1 ARRAY; -DECLARE pop2 ARRAY; - -SET pop1 = [13.3,6.0,20.0,8.0,14.0,19.0,18.0,25.0,16.0,24.0,15.0,1.0,15.0]; -SET pop2 = [22.0,16.0,21.7,21.0,30.0,26.0,12.0,23.2,28.0,23.0] ; - -SELECT `bqutils.fn.t_test`(pop1, pop2) AS actual_result_rows; - -``` - -Results: - -| Row | actual_result_rows.t_value | actual_result_rows.dof| -|-----|----------------------------|-----------------------| -| 1 | 2.8957935572829476 | 21 - ----- ### [normal_cdf(x FLOAT64, mean FLOAT64, stdev FLOAT64)](normal_cdf.sqlx) From 13ef72e7590556ccdc433abff593485d264244df Mon Sep 17 00:00:00 2001 From: Daniel De Leo Date: Tue, 4 Jan 2022 11:40:14 -0500 Subject: [PATCH 093/104] trailing newline --- stored_procedures/linear_regression.sql | 1 + 1 file changed, 1 insertion(+) diff --git a/stored_procedures/linear_regression.sql b/stored_procedures/linear_regression.sql index cc156792d..6e9bafde2 100644 --- a/stored_procedures/linear_regression.sql +++ b/stored_procedures/linear_regression.sql @@ -218,3 +218,4 @@ BEGIN ASSERT ROUND(result.b, 11) = -0.62754617565; ASSERT ROUND(result.r, 11) = -0.35654408961; END; + From 5efa96ba452e85cf214d4a8dca09ec7b82947dea Mon Sep 17 00:00:00 2001 From: Ross Thomson Date: Tue, 18 Jan 2022 20:34:26 +0000 Subject: [PATCH 094/104] Anova created. --- udfs/community/anovaftest.sqlx | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 udfs/community/anovaftest.sqlx diff --git a/udfs/community/anovaftest.sqlx b/udfs/community/anovaftest.sqlx new file mode 100644 index 000000000..c23a8115e --- /dev/null +++ b/udfs/community/anovaftest.sqlx @@ -0,0 +1,27 @@ +config { hasOutput: true } +/* + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#standardSQL + +CREATE OR REPLACE FUNCTION ${self()}(H FLOAT64, dof INT64) +RETURNS FLOAT64 +LANGUAGE js AS """ + return 1.0 - jstat.jStat['chisquare'].cdf(H, dof) +""" +OPTIONS ( + library=["${JS_BUCKET}/jstat-v1.9.4.min.js"] +); + From bf2b246f95a6472d3421a441d0625e25c883c83a Mon Sep 17 00:00:00 2001 From: Ian Mathews Date: Tue, 15 Feb 2022 10:16:35 -0800 Subject: [PATCH 095/104] feat: implement t-test paired stored proc --- stored_procedures/t_test_paired.sql | 223 ++++++++++++++++++++++++++++ 1 file changed, 223 insertions(+) create mode 100644 stored_procedures/t_test_paired.sql diff --git a/stored_procedures/t_test_paired.sql b/stored_procedures/t_test_paired.sql new file mode 100644 index 000000000..a7e90e28f --- /dev/null +++ b/stored_procedures/t_test_paired.sql @@ -0,0 +1,223 @@ +/* + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +-- @param STRING table_name table (or subquery) that contains the data +-- @param STRING name of the column in our table that represents the initial observation variable +-- @param STRING name of the column in our table that represents the final (paired) observation variable +-- @return STRUCT + +CREATE OR REPLACE PROCEDURE bqutil.procedure.t_test_paired (table_name STRING, pop1_variable STRING, pop2_variable STRING, OUT result STRUCT ) +BEGIN +EXECUTE IMMEDIATE """ + WITH pop1 AS ( + SELECT `""" || pop1_variable || """` AS value + FROM """ || table_name ||""" + ), pop2 as ( + SELECT `""" || pop2_variable || """` AS value + FROM """ || table_name || """ + ) + SELECT STRUCT( + ABS(x1 - x2) / Sqrt((st1 * st1 / n1) + (st2 * st2 / n2)) AS t_value, + n1 + n2 - 2 AS df, + bqutil.fn.pvalue(ABS(x1 - x2) / Sqrt((st1 * st1 / n1) + (st2 * st2 / n2)), n1 + n2 - 2) AS p_value + ) + FROM ( + SELECT + AVG(value) x1, + STDDEV(value) st1, + COUNT(value) AS n1 + FROM pop1 + ) + CROSS JOIN + ( + SELECT + AVG(value) x2, + STDDEV(value) st2, + COUNT(value) AS n2 + FROM pop2 + ) +""" INTO result; +END; + +-- a unit test of t_test_paired +BEGIN + DECLARE result STRUCT; + CREATE TEMP TABLE iris (sepal_length FLOAT64, sepal_width FLOAT64, petal_length FLOAT64, petal_width FLOAT64, species STRING) + AS + SELECT 5.1 AS sepal_length, + 3.5 AS sepal_width, + 1.4 AS petal_length, + 0.2 AS petal_width, + 'setosa' AS species + UNION ALL SELECT 4.9,3.0,1.4,0.2,'setosa' + UNION ALL SELECT 4.7,3.2,1.3,0.2,'setosa' + UNION ALL SELECT 4.6,3.1,1.5,0.2,'setosa' + UNION ALL SELECT 5.0,3.6,1.4,0.2,'setosa' + UNION ALL SELECT 5.4,3.9,1.7,0.4,'setosa' + UNION ALL SELECT 4.6,3.4,1.4,0.3,'setosa' + UNION ALL SELECT 5.0,3.4,1.5,0.2,'setosa' + UNION ALL SELECT 4.4,2.9,1.4,0.2,'setosa' + UNION ALL SELECT 4.9,3.1,1.5,0.1,'setosa' + UNION ALL SELECT 5.4,3.7,1.5,0.2,'setosa' + UNION ALL SELECT 4.8,3.4,1.6,0.2,'setosa' + UNION ALL SELECT 4.8,3.0,1.4,0.1,'setosa' + UNION ALL SELECT 4.3,3.0,1.1,0.1,'setosa' + 
UNION ALL SELECT 5.8,4.0,1.2,0.2,'setosa' + UNION ALL SELECT 5.7,4.4,1.5,0.4,'setosa' + UNION ALL SELECT 5.4,3.9,1.3,0.4,'setosa' + UNION ALL SELECT 5.1,3.5,1.4,0.3,'setosa' + UNION ALL SELECT 5.7,3.8,1.7,0.3,'setosa' + UNION ALL SELECT 5.1,3.8,1.5,0.3,'setosa' + UNION ALL SELECT 5.4,3.4,1.7,0.2,'setosa' + UNION ALL SELECT 5.1,3.7,1.5,0.4,'setosa' + UNION ALL SELECT 4.6,3.6,1.0,0.2,'setosa' + UNION ALL SELECT 5.1,3.3,1.7,0.5,'setosa' + UNION ALL SELECT 4.8,3.4,1.9,0.2,'setosa' + UNION ALL SELECT 5.0,3.0,1.6,0.2,'setosa' + UNION ALL SELECT 5.0,3.4,1.6,0.4,'setosa' + UNION ALL SELECT 5.2,3.5,1.5,0.2,'setosa' + UNION ALL SELECT 5.2,3.4,1.4,0.2,'setosa' + UNION ALL SELECT 4.7,3.2,1.6,0.2,'setosa' + UNION ALL SELECT 4.8,3.1,1.6,0.2,'setosa' + UNION ALL SELECT 5.4,3.4,1.5,0.4,'setosa' + UNION ALL SELECT 5.2,4.1,1.5,0.1,'setosa' + UNION ALL SELECT 5.5,4.2,1.4,0.2,'setosa' + UNION ALL SELECT 4.9,3.1,1.5,0.1,'setosa' + UNION ALL SELECT 5.0,3.2,1.2,0.2,'setosa' + UNION ALL SELECT 5.5,3.5,1.3,0.2,'setosa' + UNION ALL SELECT 4.9,3.1,1.5,0.1,'setosa' + UNION ALL SELECT 4.4,3.0,1.3,0.2,'setosa' + UNION ALL SELECT 5.1,3.4,1.5,0.2,'setosa' + UNION ALL SELECT 5.0,3.5,1.3,0.3,'setosa' + UNION ALL SELECT 4.5,2.3,1.3,0.3,'setosa' + UNION ALL SELECT 4.4,3.2,1.3,0.2,'setosa' + UNION ALL SELECT 5.0,3.5,1.6,0.6,'setosa' + UNION ALL SELECT 5.1,3.8,1.9,0.4,'setosa' + UNION ALL SELECT 4.8,3.0,1.4,0.3,'setosa' + UNION ALL SELECT 5.1,3.8,1.6,0.2,'setosa' + UNION ALL SELECT 4.6,3.2,1.4,0.2,'setosa' + UNION ALL SELECT 5.3,3.7,1.5,0.2,'setosa' + UNION ALL SELECT 5.0,3.3,1.4,0.2,'setosa' + UNION ALL SELECT 7.0,3.2,4.7,1.4,'versicolor' + UNION ALL SELECT 6.4,3.2,4.5,1.5,'versicolor' + UNION ALL SELECT 6.9,3.1,4.9,1.5,'versicolor' + UNION ALL SELECT 5.5,2.3,4.0,1.3,'versicolor' + UNION ALL SELECT 6.5,2.8,4.6,1.5,'versicolor' + UNION ALL SELECT 5.7,2.8,4.5,1.3,'versicolor' + UNION ALL SELECT 6.3,3.3,4.7,1.6,'versicolor' + UNION ALL SELECT 4.9,2.4,3.3,1.0,'versicolor' + UNION ALL SELECT 
6.6,2.9,4.6,1.3,'versicolor' + UNION ALL SELECT 5.2,2.7,3.9,1.4,'versicolor' + UNION ALL SELECT 5.0,2.0,3.5,1.0,'versicolor' + UNION ALL SELECT 5.9,3.0,4.2,1.5,'versicolor' + UNION ALL SELECT 6.0,2.2,4.0,1.0,'versicolor' + UNION ALL SELECT 6.1,2.9,4.7,1.4,'versicolor' + UNION ALL SELECT 5.6,2.9,3.6,1.3,'versicolor' + UNION ALL SELECT 6.7,3.1,4.4,1.4,'versicolor' + UNION ALL SELECT 5.6,3.0,4.5,1.5,'versicolor' + UNION ALL SELECT 5.8,2.7,4.1,1.0,'versicolor' + UNION ALL SELECT 6.2,2.2,4.5,1.5,'versicolor' + UNION ALL SELECT 5.6,2.5,3.9,1.1,'versicolor' + UNION ALL SELECT 5.9,3.2,4.8,1.8,'versicolor' + UNION ALL SELECT 6.1,2.8,4.0,1.3,'versicolor' + UNION ALL SELECT 6.3,2.5,4.9,1.5,'versicolor' + UNION ALL SELECT 6.1,2.8,4.7,1.2,'versicolor' + UNION ALL SELECT 6.4,2.9,4.3,1.3,'versicolor' + UNION ALL SELECT 6.6,3.0,4.4,1.4,'versicolor' + UNION ALL SELECT 6.8,2.8,4.8,1.4,'versicolor' + UNION ALL SELECT 6.7,3.0,5.0,1.7,'versicolor' + UNION ALL SELECT 6.0,2.9,4.5,1.5,'versicolor' + UNION ALL SELECT 5.7,2.6,3.5,1.0,'versicolor' + UNION ALL SELECT 5.5,2.4,3.8,1.1,'versicolor' + UNION ALL SELECT 5.5,2.4,3.7,1.0,'versicolor' + UNION ALL SELECT 5.8,2.7,3.9,1.2,'versicolor' + UNION ALL SELECT 6.0,2.7,5.1,1.6,'versicolor' + UNION ALL SELECT 5.4,3.0,4.5,1.5,'versicolor' + UNION ALL SELECT 6.0,3.4,4.5,1.6,'versicolor' + UNION ALL SELECT 6.7,3.1,4.7,1.5,'versicolor' + UNION ALL SELECT 6.3,2.3,4.4,1.3,'versicolor' + UNION ALL SELECT 5.6,3.0,4.1,1.3,'versicolor' + UNION ALL SELECT 5.5,2.5,4.0,1.3,'versicolor' + UNION ALL SELECT 5.5,2.6,4.4,1.2,'versicolor' + UNION ALL SELECT 6.1,3.0,4.6,1.4,'versicolor' + UNION ALL SELECT 5.8,2.6,4.0,1.2,'versicolor' + UNION ALL SELECT 5.0,2.3,3.3,1.0,'versicolor' + UNION ALL SELECT 5.6,2.7,4.2,1.3,'versicolor' + UNION ALL SELECT 5.7,3.0,4.2,1.2,'versicolor' + UNION ALL SELECT 5.7,2.9,4.2,1.3,'versicolor' + UNION ALL SELECT 6.2,2.9,4.3,1.3,'versicolor' + UNION ALL SELECT 5.1,2.5,3.0,1.1,'versicolor' + UNION ALL SELECT 5.7,2.8,4.1,1.3,'versicolor' + 
UNION ALL SELECT 6.3,3.3,6.0,2.5,'virginica' + UNION ALL SELECT 5.8,2.7,5.1,1.9,'virginica' + UNION ALL SELECT 7.1,3.0,5.9,2.1,'virginica' + UNION ALL SELECT 6.3,2.9,5.6,1.8,'virginica' + UNION ALL SELECT 6.5,3.0,5.8,2.2,'virginica' + UNION ALL SELECT 7.6,3.0,6.6,2.1,'virginica' + UNION ALL SELECT 4.9,2.5,4.5,1.7,'virginica' + UNION ALL SELECT 7.3,2.9,6.3,1.8,'virginica' + UNION ALL SELECT 6.7,2.5,5.8,1.8,'virginica' + UNION ALL SELECT 7.2,3.6,6.1,2.5,'virginica' + UNION ALL SELECT 6.5,3.2,5.1,2.0,'virginica' + UNION ALL SELECT 6.4,2.7,5.3,1.9,'virginica' + UNION ALL SELECT 6.8,3.0,5.5,2.1,'virginica' + UNION ALL SELECT 5.7,2.5,5.0,2.0,'virginica' + UNION ALL SELECT 5.8,2.8,5.1,2.4,'virginica' + UNION ALL SELECT 6.4,3.2,5.3,2.3,'virginica' + UNION ALL SELECT 6.5,3.0,5.5,1.8,'virginica' + UNION ALL SELECT 7.7,3.8,6.7,2.2,'virginica' + UNION ALL SELECT 7.7,2.6,6.9,2.3,'virginica' + UNION ALL SELECT 6.0,2.2,5.0,1.5,'virginica' + UNION ALL SELECT 6.9,3.2,5.7,2.3,'virginica' + UNION ALL SELECT 5.6,2.8,4.9,2.0,'virginica' + UNION ALL SELECT 7.7,2.8,6.7,2.0,'virginica' + UNION ALL SELECT 6.3,2.7,4.9,1.8,'virginica' + UNION ALL SELECT 6.7,3.3,5.7,2.1,'virginica' + UNION ALL SELECT 7.2,3.2,6.0,1.8,'virginica' + UNION ALL SELECT 6.2,2.8,4.8,1.8,'virginica' + UNION ALL SELECT 6.1,3.0,4.9,1.8,'virginica' + UNION ALL SELECT 6.4,2.8,5.6,2.1,'virginica' + UNION ALL SELECT 7.2,3.0,5.8,1.6,'virginica' + UNION ALL SELECT 7.4,2.8,6.1,1.9,'virginica' + UNION ALL SELECT 7.9,3.8,6.4,2.0,'virginica' + UNION ALL SELECT 6.4,2.8,5.6,2.2,'virginica' + UNION ALL SELECT 6.3,2.8,5.1,1.5,'virginica' + UNION ALL SELECT 6.1,2.6,5.6,1.4,'virginica' + UNION ALL SELECT 7.7,3.0,6.1,2.3,'virginica' + UNION ALL SELECT 6.3,3.4,5.6,2.4,'virginica' + UNION ALL SELECT 6.4,3.1,5.5,1.8,'virginica' + UNION ALL SELECT 6.0,3.0,4.8,1.8,'virginica' + UNION ALL SELECT 6.9,3.1,5.4,2.1,'virginica' + UNION ALL SELECT 6.7,3.1,5.6,2.4,'virginica' + UNION ALL SELECT 6.9,3.1,5.1,2.3,'virginica' + UNION ALL SELECT 
5.8,2.7,5.1,1.9,'virginica' + UNION ALL SELECT 6.8,3.2,5.9,2.3,'virginica' + UNION ALL SELECT 6.7,3.3,5.7,2.5,'virginica' + UNION ALL SELECT 6.7,3.0,5.2,2.3,'virginica' + UNION ALL SELECT 6.3,2.5,5.0,1.9,'virginica' + UNION ALL SELECT 6.5,3.0,5.2,2.0,'virginica' + UNION ALL SELECT 6.2,3.4,5.4,2.3,'virginica' + UNION ALL SELECT 5.9,3.0,5.1,1.8,'virginica'; + + + CALL bqutil.procedure.t_test_paired('iris', 'sepal_width', 'petal_width', result); + + -- We round to 11 decimals here because there appears to be some inconsistency in the function, likely due to floating point errors and the order of aggregation + ASSERT ROUND(result.t_value, 10) = 25.88834390257; + ASSERT result.df = 298; + ASSERT ROUND(result.p_value, 10) = 1.0; +END; + From 7e817686a1e0ef57df52692eb37cc6e6dfb99b58 Mon Sep 17 00:00:00 2001 From: Ian Mathews Date: Tue, 15 Feb 2022 15:31:24 -0800 Subject: [PATCH 096/104] feat: t_tests - implement t_test stored proc - implement t_test_paired udf - add p_value to t_test udfs --- stored_procedures/t_test.sql | 224 ++++++++++++++++++++++++++++ stored_procedures/t_test_paired.sql | 47 +++--- udfs/community/t_test.sqlx | 6 +- udfs/community/t_test_paired.sqlx | 56 +++++++ udfs/community/test_cases.js | 17 ++- 5 files changed, 316 insertions(+), 34 deletions(-) create mode 100644 stored_procedures/t_test.sql create mode 100644 udfs/community/t_test_paired.sqlx diff --git a/stored_procedures/t_test.sql b/stored_procedures/t_test.sql new file mode 100644 index 000000000..e567e6d12 --- /dev/null +++ b/stored_procedures/t_test.sql @@ -0,0 +1,224 @@ +/* + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +-- @param STRING pop1_table table name (or subquery) that contains the first population +-- @param STRING pop1_variable name of the measurement column in our first table +-- @param STRING pop2_table table name (or subquery) that contains the second population +-- @param STRING pop2_variable name of the measurement column in our second table +-- @return STRUCT + +CREATE OR REPLACE PROCEDURE bqutil.procedure.t_test(pop1_table STRING, pop1_variable STRING, pop2_table STRING, pop2_variable STRING, OUT result STRUCT ) +BEGIN +EXECUTE IMMEDIATE """ + WITH pop1 AS ( + SELECT `""" || pop1_variable || """` AS value + FROM """ || pop1_table ||""" + ), pop2 as ( + SELECT `""" || pop2_variable || """` AS value + FROM """ || pop2_table || """ + ) + SELECT STRUCT( + ABS(x1 - x2) / Sqrt((st1 * st1 / n1) + (st2 * st2 / n2)) AS t_value, + n1 + n2 - 2 AS dof, + bqutil.fn.pvalue(ABS(x1 - x2) / Sqrt((st1 * st1 / n1) + (st2 * st2 / n2)), n1 + n2 - 2) AS p_value + ) + FROM ( + SELECT + AVG(value) x1, + STDDEV(value) st1, + COUNT(value) AS n1 + FROM pop1 + ) + CROSS JOIN + ( + SELECT + AVG(value) x2, + STDDEV(value) st2, + COUNT(value) AS n2 + FROM pop2 + ) +""" INTO result; +END; + +-- a unit test of t_test +BEGIN + DECLARE result STRUCT; + CREATE TEMP TABLE iris (sepal_length FLOAT64, sepal_width FLOAT64, petal_length FLOAT64, petal_width FLOAT64, species STRING) + AS + SELECT 5.1 AS sepal_length, + 3.5 AS sepal_width, + 1.4 AS petal_length, + 0.2 AS petal_width, + 'setosa' AS species + UNION ALL SELECT 4.9,3.0,1.4,0.2,'setosa' + UNION ALL SELECT 
4.7,3.2,1.3,0.2,'setosa' + UNION ALL SELECT 4.6,3.1,1.5,0.2,'setosa' + UNION ALL SELECT 5.0,3.6,1.4,0.2,'setosa' + UNION ALL SELECT 5.4,3.9,1.7,0.4,'setosa' + UNION ALL SELECT 4.6,3.4,1.4,0.3,'setosa' + UNION ALL SELECT 5.0,3.4,1.5,0.2,'setosa' + UNION ALL SELECT 4.4,2.9,1.4,0.2,'setosa' + UNION ALL SELECT 4.9,3.1,1.5,0.1,'setosa' + UNION ALL SELECT 5.4,3.7,1.5,0.2,'setosa' + UNION ALL SELECT 4.8,3.4,1.6,0.2,'setosa' + UNION ALL SELECT 4.8,3.0,1.4,0.1,'setosa' + UNION ALL SELECT 4.3,3.0,1.1,0.1,'setosa' + UNION ALL SELECT 5.8,4.0,1.2,0.2,'setosa' + UNION ALL SELECT 5.7,4.4,1.5,0.4,'setosa' + UNION ALL SELECT 5.4,3.9,1.3,0.4,'setosa' + UNION ALL SELECT 5.1,3.5,1.4,0.3,'setosa' + UNION ALL SELECT 5.7,3.8,1.7,0.3,'setosa' + UNION ALL SELECT 5.1,3.8,1.5,0.3,'setosa' + UNION ALL SELECT 5.4,3.4,1.7,0.2,'setosa' + UNION ALL SELECT 5.1,3.7,1.5,0.4,'setosa' + UNION ALL SELECT 4.6,3.6,1.0,0.2,'setosa' + UNION ALL SELECT 5.1,3.3,1.7,0.5,'setosa' + UNION ALL SELECT 4.8,3.4,1.9,0.2,'setosa' + UNION ALL SELECT 5.0,3.0,1.6,0.2,'setosa' + UNION ALL SELECT 5.0,3.4,1.6,0.4,'setosa' + UNION ALL SELECT 5.2,3.5,1.5,0.2,'setosa' + UNION ALL SELECT 5.2,3.4,1.4,0.2,'setosa' + UNION ALL SELECT 4.7,3.2,1.6,0.2,'setosa' + UNION ALL SELECT 4.8,3.1,1.6,0.2,'setosa' + UNION ALL SELECT 5.4,3.4,1.5,0.4,'setosa' + UNION ALL SELECT 5.2,4.1,1.5,0.1,'setosa' + UNION ALL SELECT 5.5,4.2,1.4,0.2,'setosa' + UNION ALL SELECT 4.9,3.1,1.5,0.1,'setosa' + UNION ALL SELECT 5.0,3.2,1.2,0.2,'setosa' + UNION ALL SELECT 5.5,3.5,1.3,0.2,'setosa' + UNION ALL SELECT 4.9,3.1,1.5,0.1,'setosa' + UNION ALL SELECT 4.4,3.0,1.3,0.2,'setosa' + UNION ALL SELECT 5.1,3.4,1.5,0.2,'setosa' + UNION ALL SELECT 5.0,3.5,1.3,0.3,'setosa' + UNION ALL SELECT 4.5,2.3,1.3,0.3,'setosa' + UNION ALL SELECT 4.4,3.2,1.3,0.2,'setosa' + UNION ALL SELECT 5.0,3.5,1.6,0.6,'setosa' + UNION ALL SELECT 5.1,3.8,1.9,0.4,'setosa' + UNION ALL SELECT 4.8,3.0,1.4,0.3,'setosa' + UNION ALL SELECT 5.1,3.8,1.6,0.2,'setosa' + UNION ALL SELECT 
4.6,3.2,1.4,0.2,'setosa' + UNION ALL SELECT 5.3,3.7,1.5,0.2,'setosa' + UNION ALL SELECT 5.0,3.3,1.4,0.2,'setosa' + UNION ALL SELECT 7.0,3.2,4.7,1.4,'versicolor' + UNION ALL SELECT 6.4,3.2,4.5,1.5,'versicolor' + UNION ALL SELECT 6.9,3.1,4.9,1.5,'versicolor' + UNION ALL SELECT 5.5,2.3,4.0,1.3,'versicolor' + UNION ALL SELECT 6.5,2.8,4.6,1.5,'versicolor' + UNION ALL SELECT 5.7,2.8,4.5,1.3,'versicolor' + UNION ALL SELECT 6.3,3.3,4.7,1.6,'versicolor' + UNION ALL SELECT 4.9,2.4,3.3,1.0,'versicolor' + UNION ALL SELECT 6.6,2.9,4.6,1.3,'versicolor' + UNION ALL SELECT 5.2,2.7,3.9,1.4,'versicolor' + UNION ALL SELECT 5.0,2.0,3.5,1.0,'versicolor' + UNION ALL SELECT 5.9,3.0,4.2,1.5,'versicolor' + UNION ALL SELECT 6.0,2.2,4.0,1.0,'versicolor' + UNION ALL SELECT 6.1,2.9,4.7,1.4,'versicolor' + UNION ALL SELECT 5.6,2.9,3.6,1.3,'versicolor' + UNION ALL SELECT 6.7,3.1,4.4,1.4,'versicolor' + UNION ALL SELECT 5.6,3.0,4.5,1.5,'versicolor' + UNION ALL SELECT 5.8,2.7,4.1,1.0,'versicolor' + UNION ALL SELECT 6.2,2.2,4.5,1.5,'versicolor' + UNION ALL SELECT 5.6,2.5,3.9,1.1,'versicolor' + UNION ALL SELECT 5.9,3.2,4.8,1.8,'versicolor' + UNION ALL SELECT 6.1,2.8,4.0,1.3,'versicolor' + UNION ALL SELECT 6.3,2.5,4.9,1.5,'versicolor' + UNION ALL SELECT 6.1,2.8,4.7,1.2,'versicolor' + UNION ALL SELECT 6.4,2.9,4.3,1.3,'versicolor' + UNION ALL SELECT 6.6,3.0,4.4,1.4,'versicolor' + UNION ALL SELECT 6.8,2.8,4.8,1.4,'versicolor' + UNION ALL SELECT 6.7,3.0,5.0,1.7,'versicolor' + UNION ALL SELECT 6.0,2.9,4.5,1.5,'versicolor' + UNION ALL SELECT 5.7,2.6,3.5,1.0,'versicolor' + UNION ALL SELECT 5.5,2.4,3.8,1.1,'versicolor' + UNION ALL SELECT 5.5,2.4,3.7,1.0,'versicolor' + UNION ALL SELECT 5.8,2.7,3.9,1.2,'versicolor' + UNION ALL SELECT 6.0,2.7,5.1,1.6,'versicolor' + UNION ALL SELECT 5.4,3.0,4.5,1.5,'versicolor' + UNION ALL SELECT 6.0,3.4,4.5,1.6,'versicolor' + UNION ALL SELECT 6.7,3.1,4.7,1.5,'versicolor' + UNION ALL SELECT 6.3,2.3,4.4,1.3,'versicolor' + UNION ALL SELECT 5.6,3.0,4.1,1.3,'versicolor' + UNION ALL 
SELECT 5.5,2.5,4.0,1.3,'versicolor' + UNION ALL SELECT 5.5,2.6,4.4,1.2,'versicolor' + UNION ALL SELECT 6.1,3.0,4.6,1.4,'versicolor' + UNION ALL SELECT 5.8,2.6,4.0,1.2,'versicolor' + UNION ALL SELECT 5.0,2.3,3.3,1.0,'versicolor' + UNION ALL SELECT 5.6,2.7,4.2,1.3,'versicolor' + UNION ALL SELECT 5.7,3.0,4.2,1.2,'versicolor' + UNION ALL SELECT 5.7,2.9,4.2,1.3,'versicolor' + UNION ALL SELECT 6.2,2.9,4.3,1.3,'versicolor' + UNION ALL SELECT 5.1,2.5,3.0,1.1,'versicolor' + UNION ALL SELECT 5.7,2.8,4.1,1.3,'versicolor' + UNION ALL SELECT 6.3,3.3,6.0,2.5,'virginica' + UNION ALL SELECT 5.8,2.7,5.1,1.9,'virginica' + UNION ALL SELECT 7.1,3.0,5.9,2.1,'virginica' + UNION ALL SELECT 6.3,2.9,5.6,1.8,'virginica' + UNION ALL SELECT 6.5,3.0,5.8,2.2,'virginica' + UNION ALL SELECT 7.6,3.0,6.6,2.1,'virginica' + UNION ALL SELECT 4.9,2.5,4.5,1.7,'virginica' + UNION ALL SELECT 7.3,2.9,6.3,1.8,'virginica' + UNION ALL SELECT 6.7,2.5,5.8,1.8,'virginica' + UNION ALL SELECT 7.2,3.6,6.1,2.5,'virginica' + UNION ALL SELECT 6.5,3.2,5.1,2.0,'virginica' + UNION ALL SELECT 6.4,2.7,5.3,1.9,'virginica' + UNION ALL SELECT 6.8,3.0,5.5,2.1,'virginica' + UNION ALL SELECT 5.7,2.5,5.0,2.0,'virginica' + UNION ALL SELECT 5.8,2.8,5.1,2.4,'virginica' + UNION ALL SELECT 6.4,3.2,5.3,2.3,'virginica' + UNION ALL SELECT 6.5,3.0,5.5,1.8,'virginica' + UNION ALL SELECT 7.7,3.8,6.7,2.2,'virginica' + UNION ALL SELECT 7.7,2.6,6.9,2.3,'virginica' + UNION ALL SELECT 6.0,2.2,5.0,1.5,'virginica' + UNION ALL SELECT 6.9,3.2,5.7,2.3,'virginica' + UNION ALL SELECT 5.6,2.8,4.9,2.0,'virginica' + UNION ALL SELECT 7.7,2.8,6.7,2.0,'virginica' + UNION ALL SELECT 6.3,2.7,4.9,1.8,'virginica' + UNION ALL SELECT 6.7,3.3,5.7,2.1,'virginica' + UNION ALL SELECT 7.2,3.2,6.0,1.8,'virginica' + UNION ALL SELECT 6.2,2.8,4.8,1.8,'virginica' + UNION ALL SELECT 6.1,3.0,4.9,1.8,'virginica' + UNION ALL SELECT 6.4,2.8,5.6,2.1,'virginica' + UNION ALL SELECT 7.2,3.0,5.8,1.6,'virginica' + UNION ALL SELECT 7.4,2.8,6.1,1.9,'virginica' + UNION ALL SELECT 
7.9,3.8,6.4,2.0,'virginica' + UNION ALL SELECT 6.4,2.8,5.6,2.2,'virginica' + UNION ALL SELECT 6.3,2.8,5.1,1.5,'virginica' + UNION ALL SELECT 6.1,2.6,5.6,1.4,'virginica' + UNION ALL SELECT 7.7,3.0,6.1,2.3,'virginica' + UNION ALL SELECT 6.3,3.4,5.6,2.4,'virginica' + UNION ALL SELECT 6.4,3.1,5.5,1.8,'virginica' + UNION ALL SELECT 6.0,3.0,4.8,1.8,'virginica' + UNION ALL SELECT 6.9,3.1,5.4,2.1,'virginica' + UNION ALL SELECT 6.7,3.1,5.6,2.4,'virginica' + UNION ALL SELECT 6.9,3.1,5.1,2.3,'virginica' + UNION ALL SELECT 5.8,2.7,5.1,1.9,'virginica' + UNION ALL SELECT 6.8,3.2,5.9,2.3,'virginica' + UNION ALL SELECT 6.7,3.3,5.7,2.5,'virginica' + UNION ALL SELECT 6.7,3.0,5.2,2.3,'virginica' + UNION ALL SELECT 6.3,2.5,5.0,1.9,'virginica' + UNION ALL SELECT 6.5,3.0,5.2,2.0,'virginica' + UNION ALL SELECT 6.2,3.4,5.4,2.3,'virginica' + UNION ALL SELECT 5.9,3.0,5.1,1.8,'virginica'; + + + CALL bqutil.procedure.t_test('iris', 'sepal_width', 'iris', 'petal_width', result); + + -- We round to 11 decimals here because there appears to be some inconsistency in the function, likely due to floating point errors and the order of aggregation + ASSERT ROUND(result.t_value, 11) = 25.88834390257; + ASSERT result.dof = 298; + ASSERT result.p_value = 1.0; +END; + diff --git a/stored_procedures/t_test_paired.sql b/stored_procedures/t_test_paired.sql index a7e90e28f..6fcba9113 100644 --- a/stored_procedures/t_test_paired.sql +++ b/stored_procedures/t_test_paired.sql @@ -15,46 +15,37 @@ */ -- @param STRING table_name table (or subquery) that contains the data --- @param STRING name of the column in our table that represents the initial observation variable --- @param STRING name of the column in our table that represents the final (paired) observation variable --- @return STRUCT +-- @param STRING pop1_variable name of the column in our table that represents the initial observation variable +-- @param STRING pop2_variable name of the column in our table that represents the final (paired) observation 
variable +-- @return STRUCT -CREATE OR REPLACE PROCEDURE bqutil.procedure.t_test_paired (table_name STRING, pop1_variable STRING, pop2_variable STRING, OUT result STRUCT ) +CREATE OR REPLACE PROCEDURE bqutil.procedure.t_test_paired (table_name STRING, pop1_variable STRING, pop2_variable STRING, OUT result STRUCT ) BEGIN EXECUTE IMMEDIATE """ - WITH pop1 AS ( - SELECT `""" || pop1_variable || """` AS value - FROM """ || table_name ||""" - ), pop2 as ( - SELECT `""" || pop2_variable || """` AS value + WITH source_data AS ( + SELECT + """ || pop1_variable || """ AS val1, + """ || pop2_variable || """ as val2 FROM """ || table_name || """ ) SELECT STRUCT( - ABS(x1 - x2) / Sqrt((st1 * st1 / n1) + (st2 * st2 / n2)) AS t_value, - n1 + n2 - 2 AS df, - bqutil.fn.pvalue(ABS(x1 - x2) / Sqrt((st1 * st1 / n1) + (st2 * st2 / n2)), n1 + n2 - 2) AS p_value + mean / ( std / SQRT(N) ) AS t_value, + n-1 AS dof, + bqutil.fn.pvalue(mean / ( std / SQRT(N) ), n-1) AS p_value ) FROM ( SELECT - AVG(value) x1, - STDDEV(value) st1, - COUNT(value) AS n1 - FROM pop1 - ) - CROSS JOIN - ( - SELECT - AVG(value) x2, - STDDEV(value) st2, - COUNT(value) AS n2 - FROM pop2 + AVG( source_data.val1 - source_data.val2 ) as mean, + STDDEV_SAMP( source_data.val1 - source_data.val2 ) as std, + COUNT(*) as n + FROM source_data ) """ INTO result; END; -- a unit test of t_test_paired BEGIN - DECLARE result STRUCT; + DECLARE result STRUCT; CREATE TEMP TABLE iris (sepal_length FLOAT64, sepal_width FLOAT64, petal_length FLOAT64, petal_width FLOAT64, species STRING) AS SELECT 5.1 AS sepal_length, @@ -216,8 +207,8 @@ BEGIN CALL bqutil.procedure.t_test_paired('iris', 'sepal_width', 'petal_width', result); -- We round to 11 decimals here because there appears to be some inconsistency in the function, likely due to floating point errors and the order of aggregation - ASSERT ROUND(result.t_value, 10) = 25.88834390257; - ASSERT result.df = 298; - ASSERT ROUND(result.p_value, 10) = 1.0; + ASSERT ROUND(result.t_value, 11) 
= 22.65094961383; + ASSERT result.dof = 149; + ASSERT result.p_value = 1.0; END; diff --git a/udfs/community/t_test.sqlx b/udfs/community/t_test.sqlx index d2a89b5e9..7132d8bad 100644 --- a/udfs/community/t_test.sqlx +++ b/udfs/community/t_test.sqlx @@ -22,7 +22,7 @@ config { hasOutput: true } -- pop1, array of FLOAT64, values from first group -- pop2, array of FLOAT64, values from second group -- Output: --- Struct of t_value and DOF +-- Struct of t_value, dof, and p_value CREATE OR REPLACE FUNCTION ${self()}(pop1 ARRAY, pop2 ARRAY) AS (( @@ -42,7 +42,9 @@ CREATE OR REPLACE FUNCTION ${self()}(pop1 ARRAY, pop2 ARRAY) A SELECT STRUCT( ABS(x1 - x2) / Sqrt((st1 * st1 / n1) + (st2 * st2 / n2)) AS t_value, - n1 + n2 - 2 AS dof) + n1 + n2 - 2 AS dof, + bqutil.fn.pvalue(ABS(x1 - x2) / Sqrt((st1 * st1 / n1) + (st2 * st2 / n2)), n1 + n2 - 2) AS p_value + ) FROM pop1 CROSS JOIN pop2 )); diff --git a/udfs/community/t_test_paired.sqlx b/udfs/community/t_test_paired.sqlx new file mode 100644 index 000000000..1680df9ec --- /dev/null +++ b/udfs/community/t_test_paired.sqlx @@ -0,0 +1,56 @@ +config { hasOutput: true } + +/* + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +-- Paired (dependent samples) T-Test +-- Input: +-- pop1, array of FLOAT64, values from first group +-- pop2, array of FLOAT64, values from second group +-- Output: +-- Struct of t_value, dof, and p_value + +CREATE OR REPLACE FUNCTION ${self()}(pop1 ARRAY, pop2 ARRAY) AS (( + WITH t1 as ( + SELECT id, val + FROM UNNEST(pop1) as val + WITH OFFSET AS id + ORDER BY id + ), + t2 as ( + SELECT id, val + FROM UNNEST(pop2) as val + WITH OFFSET AS id + ORDER BY id + ) + SELECT + STRUCT( + mean / ( std / SQRT(N) ) AS t_value, + N-1 AS dof, + bqutil.fn.pvalue(mean / ( std / SQRT(N) ), n-1) AS p_value + ) + FROM ( + SELECT + AVG( t1.val - t2.val ) as mean, + STDDEV_SAMP( t1.val - t2.val ) as std, + COUNT(*) as N + FROM t1 + INNER JOIN t2 + ON t1.id = t2.id + ) +)); + diff --git a/udfs/community/test_cases.js b/udfs/community/test_cases.js index 01d1f3636..49e7e9ad6 100644 --- a/udfs/community/test_cases.js +++ b/udfs/community/test_cases.js @@ -935,16 +935,25 @@ generate_udf_test("p_fisherexact", [ ], expected_output: `CAST(8.046828829103659E-12 AS FLOAT64)` }, -]); +]); generate_udf_test("t_test", [ { inputs: [ `(SELECT ARRAY[13.3,6.0,20.0,8.0,14.0,19.0,18.0,25.0,16.0,24.0,15.0,1.0,15.0])`, - `(SELECT ARRAY[22.0,16.0,21.7,21.0,30.0,26.0,12.0,23.2,28.0,23.0])` + `(SELECT ARRAY[22.0,16.0,21.7,21.0,30.0,26.0,12.0,23.2,28.0,23.0])` ], - expected_output: `STRUCT(CAST(2.8957935572829476 AS FLOAT64) AS t_value, CAST(21 AS INTEGER) AS dof)` + expected_output: `STRUCT(CAST(2.8957935572829476 AS FLOAT64) AS t_value, CAST(21 AS INTEGER) AS dof, CAST(0.9999989005977227 AS FLOAT64) AS p_value)` }, -]); +]); +generate_udf_test("t_test_paired", [ + { + inputs: [ + `(SELECT ARRAY[ 38.25, 31.68, 26.24, 41.29, 44.81, 46.37, 35.42, 38.41, 42.68, 46.71, 29.20, 30.76])`, + `(SELECT ARRAY[ 38.27, 31.71, 26.22, 41.33, 44.80, 46.39, 35.46, 38.39, 42.72, 46.76, 29.18, 30.79])` + ], + expected_output: `STRUCT(CAST(-2.1589388479419087 AS FLOAT64) AS t_value, CAST(11 AS INTEGER) AS dof, 
CAST(1.0 AS FLOAT64) AS p_value)` + }, +]); generate_udf_test("mannwhitneyu", [ { inputs: [ From a8448c0095e58704b70dca3d45f2977550f9371f Mon Sep 17 00:00:00 2001 From: Ross Thomson Date: Tue, 12 Apr 2022 14:09:43 +0000 Subject: [PATCH 097/104] added testing version of ANOVA --- udfs/community/anovaftest.sqlx | 38 +++++++++++++++++++++++++++------- 1 file changed, 30 insertions(+), 8 deletions(-) diff --git a/udfs/community/anovaftest.sqlx b/udfs/community/anovaftest.sqlx index c23a8115e..3e053733d 100644 --- a/udfs/community/anovaftest.sqlx +++ b/udfs/community/anovaftest.sqlx @@ -16,12 +16,34 @@ config { hasOutput: true } */ #standardSQL -CREATE OR REPLACE FUNCTION ${self()}(H FLOAT64, dof INT64) -RETURNS FLOAT64 -LANGUAGE js AS """ - return 1.0 - jstat.jStat['chisquare'].cdf(H, dof) -""" -OPTIONS ( - library=["${JS_BUCKET}/jstat-v1.9.4.min.js"] -); +/* +execute using: `bq query < anovaftest.sqlx` +save iris dataset: `bq --format csv query "SELECT * FROM bigquery-public-data.ml_datasets.iris" > iris.csv` +*/ + +DECLARE data ARRAY>; +set data = (SELECT ARRAY(SELECT AS STRUCT species, petal_width FROM `bigquery-public-data.ml_datasets.iris`)); + +CREATE TEMP FUNCTION MMM (data ARRAY>) AS (( + WITH stats AS + ( + WITH raw_data AS + ( + SELECT d.val AS v, d.factor as f + FROM UNNEST(data) AS d + ) #raw_data + + SELECT "global" as label, VAR_SAMP(v) as var, COUNT(*) as cnt, COUNT(DISTINCT f) as discnt FROM raw_data + UNION ALL + SELECT "sample" as label, VAR_SAMP(v) as var, NULL as cnt, NULL as discnt FROM raw_data GROUP BY f + + + ) + + SELECT ((SELECT var from stats where label = "global") / ((SELECT cnt from stats where label = "global")-1)) + /((SELECT SUM(var) from stats where label = "sample") / ((SELECT cnt from stats where label = "global")-(SELECT discnt from stats where label = "global"))) + +)); + +SELECT MMM(data) AS results; \ No newline at end of file From cd9fb06fe8d152615dd887856a9b572fbb83488e Mon Sep 17 00:00:00 2001 From: Ross Thomson Date: Tue, 12 
Apr 2022 14:13:38 +0000 Subject: [PATCH 098/104] commented out config temporarily --- udfs/community/anovaftest.sqlx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udfs/community/anovaftest.sqlx b/udfs/community/anovaftest.sqlx index 3e053733d..b53eefbb7 100644 --- a/udfs/community/anovaftest.sqlx +++ b/udfs/community/anovaftest.sqlx @@ -1,4 +1,4 @@ -config { hasOutput: true } +-- config { hasOutput: true } /* * Copyright 2021 Google LLC * From 0c20220714acc7e31030d093cfba4f4fa9880a6f Mon Sep 17 00:00:00 2001 From: Ross Thomson Date: Wed, 11 May 2022 16:02:39 +0000 Subject: [PATCH 099/104] Added Anova. Trying to get testing to go through. --- udfs/community/anovaftest.sqlx | 52 ++++++++++++++-------------------- udfs/community/test_cases.js | 6 ++++ udfs/js_libs/js_libs.yaml | 3 +- 3 files changed, 29 insertions(+), 32 deletions(-) diff --git a/udfs/community/anovaftest.sqlx b/udfs/community/anovaftest.sqlx index b53eefbb7..62df0f8d8 100644 --- a/udfs/community/anovaftest.sqlx +++ b/udfs/community/anovaftest.sqlx @@ -14,36 +14,26 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#standardSQL -/* -execute using: `bq query < anovaftest.sqlx` -save iris dataset: `bq --format csv query "SELECT * FROM bigquery-public-data.ml_datasets.iris" > iris.csv` -*/ - - -DECLARE data ARRAY>; -set data = (SELECT ARRAY(SELECT AS STRUCT species, petal_width FROM `bigquery-public-data.ml_datasets.iris`)); - -CREATE TEMP FUNCTION MMM (data ARRAY>) AS (( - WITH stats AS - ( - WITH raw_data AS - ( - SELECT d.val AS v, d.factor as f - FROM UNNEST(data) AS d - ) #raw_data - - SELECT "global" as label, VAR_SAMP(v) as var, COUNT(*) as cnt, COUNT(DISTINCT f) as discnt FROM raw_data - UNION ALL - SELECT "sample" as label, VAR_SAMP(v) as var, NULL as cnt, NULL as discnt FROM raw_data GROUP BY f - - - ) +CREATE OR REPLACE FUNCTION ${self()}(data ARRAY>) AS (( + WITH raw_data AS ( + SELECT d.val AS v, d.factor as f + FROM UNNEST(data) AS d + ), + global_stats AS ( + SELECT AVG(v) as M, COUNT(*) - COUNT(DISTINCT f) as cnt FROM raw_data + ), + exp_var AS ( + SELECT COUNT(v) * POW(AVG(v) - (select M from global_stats), 2) as var, NULL as cnt, NULL as discnt FROM raw_data GROUP BY f + ), + unexp_var as ( + SELECT COUNT(v) * VAR_POP(v) as var FROM raw_data GROUP BY f + ) - SELECT ((SELECT var from stats where label = "global") / ((SELECT cnt from stats where label = "global")-1)) - /((SELECT SUM(var) from stats where label = "sample") / ((SELECT cnt from stats where label = "global")-(SELECT discnt from stats where label = "global"))) - -)); - -SELECT MMM(data) AS results; \ No newline at end of file + SELECT + ( + (SELECT SUM(var) / (COUNT(var)-1) FROM exp_var) + / (SELECT SUM(var) / (select cnt from global_stats) FROM unexp_var) + ) + +)); \ No newline at end of file diff --git a/udfs/community/test_cases.js b/udfs/community/test_cases.js index 49e7e9ad6..9bd19b19a 100644 --- a/udfs/community/test_cases.js +++ b/udfs/community/test_cases.js @@ -896,6 +896,12 @@ generate_udf_test("kruskal_wallis", [ expected_output: `STRUCT(CAST(3.423076923076927 AS FLOAT64) AS H, CAST( 
0.1805877514841956 AS FLOAT64) AS p, CAST(2 AS INT64) AS DoF)` }, ]); +generate_udf_test("anovaftest", [ + { + inputs: [ ("versicolor",1.1), ("versicolor",1.0), ("versicolor",1.0), ("versicolor",1.0), ("versicolor",1.2), ("versicolor",1.3), ("versicolor",1.3), ("versicolor",1.3), ("versicolor",1.5), ("versicolor",1.5), ("versicolor",1.5), ("versicolor",1.5), ("versicolor",1.5), ("virginica", 1.7), ("versicolor",1.6), ("versicolor",1.3), ("virginica", 2.0), ("virginica", 1.5), ("versicolor",1.7), ("virginica", 1.9), ("virginica", 2.1), ("virginica", 1.8), ("virginica", 1.8), ("virginica", 2.5), ("virginica", 1.8), ("setosa", 0.2), ("setosa", 0.3), ("setosa", 0.1), ("setosa", 0.1), ("setosa", 0.1), ("setosa", 0.1), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.4), ("setosa", 0.4), ("setosa", 0.4), ("versicolor",1.2), ("versicolor",1.4), ("versicolor",1.1), ("versicolor",1.3), ("versicolor",1.3), ("versicolor",1.4), ("versicolor",1.8), ("virginica", 1.8), ("virginica", 1.8), ("virginica", 2.3), ("virginica", 1.9), ("virginica", 2.2), ("virginica", 1.6), ("virginica", 1.8), ("virginica", 1.8), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.5), ("setosa", 0.3), ("setosa", 0.2), ("setosa", 0.4), ("versicolor",1.0), ("versicolor",1.0), ("versicolor",1.1), ("versicolor",1.0), ("versicolor",1.3), ("versicolor",1.3), ("versicolor",1.5), ("versicolor",1.4), ("versicolor",1.3), ("virginica", 2.0), ("virginica", 1.5), ("virginica", 2.4), ("virginica", 2.3), ("virginica", 1.9), ("virginica", 1.9), ("versicolor",1.6), ("virginica", 1.8), ("virginica", 2.4), ("virginica", 2.4), ("virginica", 1.4), ("virginica", 2.2), ("virginica", 2.1), ("virginica", 1.8), ("virginica", 2.5), ("virginica", 2.3), ("virginica", 1.9), ("virginica", 2.1), ("setosa", 0.3), ("setosa", 0.3), ("setosa", 0.3), ("setosa", 0.1), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 
0.2), ("setosa", 0.2), ], + expected_output: `959.3244057257594 as results` + }, +]); generate_udf_test("linear_regression", [ { inputs: [`(SELECT [ (5.1,2.5), (5.0,2.0), (5.7,2.6), (6.0,2.2), (5.8,2.6), (5.5,2.3), (6.1,2.8), (5.5,2.5), (6.4,3.2), (5.6,3.0)])`], diff --git a/udfs/js_libs/js_libs.yaml b/udfs/js_libs/js_libs.yaml index b08a62cc0..7e23f63c7 100644 --- a/udfs/js_libs/js_libs.yaml +++ b/udfs/js_libs/js_libs.yaml @@ -7,4 +7,5 @@ js-levenshtein: jstat: versions: - 1.9.3 - - 1.9.4 \ No newline at end of file + - 1.9.4 + - 1.9.5 \ No newline at end of file From 59bb00043a5cd23c435d8480c287b8285512dd55 Mon Sep 17 00:00:00 2001 From: Ross Thomson Date: Thu, 9 Jun 2022 14:24:01 +0000 Subject: [PATCH 100/104] Added Anova f test --- udfs/community/test_cases.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udfs/community/test_cases.js b/udfs/community/test_cases.js index 575c17771..c88315f08 100644 --- a/udfs/community/test_cases.js +++ b/udfs/community/test_cases.js @@ -1001,7 +1001,7 @@ generate_udf_test("kruskal_wallis", [ generate_udf_test("anovaftest", [ { inputs: [ ("versicolor",1.1), ("versicolor",1.0), ("versicolor",1.0), ("versicolor",1.0), ("versicolor",1.2), ("versicolor",1.3), ("versicolor",1.3), ("versicolor",1.3), ("versicolor",1.5), ("versicolor",1.5), ("versicolor",1.5), ("versicolor",1.5), ("versicolor",1.5), ("virginica", 1.7), ("versicolor",1.6), ("versicolor",1.3), ("virginica", 2.0), ("virginica", 1.5), ("versicolor",1.7), ("virginica", 1.9), ("virginica", 2.1), ("virginica", 1.8), ("virginica", 1.8), ("virginica", 2.5), ("virginica", 1.8), ("setosa", 0.2), ("setosa", 0.3), ("setosa", 0.1), ("setosa", 0.1), ("setosa", 0.1), ("setosa", 0.1), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.4), ("setosa", 0.4), ("setosa", 0.4), ("versicolor",1.2), ("versicolor",1.4), ("versicolor",1.1), ("versicolor",1.3), ("versicolor",1.3), ("versicolor",1.4), 
("versicolor",1.8), ("virginica", 1.8), ("virginica", 1.8), ("virginica", 2.3), ("virginica", 1.9), ("virginica", 2.2), ("virginica", 1.6), ("virginica", 1.8), ("virginica", 1.8), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.5), ("setosa", 0.3), ("setosa", 0.2), ("setosa", 0.4), ("versicolor",1.0), ("versicolor",1.0), ("versicolor",1.1), ("versicolor",1.0), ("versicolor",1.3), ("versicolor",1.3), ("versicolor",1.5), ("versicolor",1.4), ("versicolor",1.3), ("virginica", 2.0), ("virginica", 1.5), ("virginica", 2.4), ("virginica", 2.3), ("virginica", 1.9), ("virginica", 1.9), ("versicolor",1.6), ("virginica", 1.8), ("virginica", 2.4), ("virginica", 2.4), ("virginica", 1.4), ("virginica", 2.2), ("virginica", 2.1), ("virginica", 1.8), ("virginica", 2.5), ("virginica", 2.3), ("virginica", 1.9), ("virginica", 2.1), ("setosa", 0.3), ("setosa", 0.3), ("setosa", 0.3), ("setosa", 0.1), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ], - expected_output: `959.3244057257594 as results` + expected_output: `CAST(959.3244057257594 AS FLOAT64)` }, ]); generate_udf_test("linear_regression", [ From 5dba30820f56151564d900078d54d7370bb4340e Mon Sep 17 00:00:00 2001 From: Ross Thomson Date: Thu, 9 Jun 2022 15:11:19 +0000 Subject: [PATCH 101/104] test --- udfs/community/anovaftest.sqlx | 2 -- udfs/community/test_cases.js | 3 ++- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/udfs/community/anovaftest.sqlx b/udfs/community/anovaftest.sqlx index 62df0f8d8..0d910dc33 100644 --- a/udfs/community/anovaftest.sqlx +++ b/udfs/community/anovaftest.sqlx @@ -31,9 +31,7 @@ CREATE OR REPLACE FUNCTION ${self()}(data ARRAY[ 38.27, 31.71, 26.22, 41.33, 44.80, 46.39, 35.46, 38.39, 42.72, 46.76, 29.18, 30.79])` ], expected_output: `STRUCT(CAST(-2.1589388479419087 AS FLOAT64) AS t_value, CAST(11 AS INTEGER) AS dof, CAST(1.0 AS FLOAT64) AS p_value)` + } ]); generate_udf_test("mannwhitneyu", [ From 
a3fce9b1d64676e37b7d1b179bd66fee8d67b53b Mon Sep 17 00:00:00 2001 From: Ross Thomson Date: Tue, 19 Jul 2022 22:01:18 +0000 Subject: [PATCH 102/104] Working anova f test --- udfs/community/anovaftest.sqlx | 2 +- udfs/community/test_cases.js | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/udfs/community/anovaftest.sqlx b/udfs/community/anovaftest.sqlx index 0d910dc33..82e631fe2 100644 --- a/udfs/community/anovaftest.sqlx +++ b/udfs/community/anovaftest.sqlx @@ -1,4 +1,4 @@ --- config { hasOutput: true } +config { hasOutput: true } /* * Copyright 2021 Google LLC * diff --git a/udfs/community/test_cases.js b/udfs/community/test_cases.js index 0982f44a2..325b9eeaa 100644 --- a/udfs/community/test_cases.js +++ b/udfs/community/test_cases.js @@ -1000,7 +1000,7 @@ generate_udf_test("kruskal_wallis", [ ]); generate_udf_test("anovaftest", [ { - inputs: [ `(SELECT [("versicolor",1.1), ("versicolor",1.0), ("versicolor",1.0), ("versicolor",1.0), ("versicolor",1.2), ("versicolor",1.3), ("versicolor",1.3), ("versicolor",1.3), ("versicolor",1.5), ("versicolor",1.5), ("versicolor",1.5), ("versicolor",1.5), ("versicolor",1.5), ("virginica", 1.7), ("versicolor",1.6), ("versicolor",1.3), ("virginica", 2.0), ("virginica", 1.5), ("versicolor",1.7), ("virginica", 1.9), ("virginica", 2.1), ("virginica", 1.8), ("virginica", 1.8), ("virginica", 2.5), ("virginica", 1.8), ("setosa", 0.2), ("setosa", 0.3), ("setosa", 0.1), ("setosa", 0.1), ("setosa", 0.1), ("setosa", 0.1), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.4), ("setosa", 0.4), ("setosa", 0.4), ("versicolor",1.2), ("versicolor",1.4), ("versicolor",1.1), ("versicolor",1.3), ("versicolor",1.3), ("versicolor",1.4), ("versicolor",1.8), ("virginica", 1.8), ("virginica", 1.8), ("virginica", 2.3), ("virginica", 1.9), ("virginica", 2.2), ("virginica", 1.6), ("virginica", 1.8), ("virginica", 1.8), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.5), 
("setosa", 0.3), ("setosa", 0.2), ("setosa", 0.4), ("versicolor",1.0), ("versicolor",1.0), ("versicolor",1.1), ("versicolor",1.0), ("versicolor",1.3), ("versicolor",1.3), ("versicolor",1.5), ("versicolor",1.4), ("versicolor",1.3), ("virginica", 2.0), ("virginica", 1.5), ("virginica", 2.4), ("virginica", 2.3), ("virginica", 1.9), ("virginica", 1.9), ("versicolor",1.6), ("virginica", 1.8), ("virginica", 2.4), ("virginica", 2.4), ("virginica", 1.4), ("virginica", 2.2), ("virginica", 2.1), ("virginica", 1.8), ("virginica", 2.5), ("virginica", 2.3), ("virginica", 1.9), ("virginica", 2.1), ("setosa", 0.3), ("setosa", 0.3), ("setosa", 0.3), ("setosa", 0.1), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2)])`], + inputs: [`(SELECT [("versicolor",1.1), ("versicolor",1.0), ("versicolor",1.0), ("versicolor",1.0), ("versicolor",1.2), ("versicolor",1.3), ("versicolor",1.3), ("versicolor",1.3), ("versicolor",1.5), ("versicolor",1.5), ("versicolor",1.5), ("versicolor",1.5), ("versicolor",1.5), ("virginica", 1.7), ("versicolor",1.6), ("versicolor",1.3), ("virginica", 2.0), ("virginica", 1.5), ("versicolor",1.7), ("virginica", 1.9), ("virginica", 2.1), ("virginica", 1.8), ("virginica", 1.8), ("virginica", 2.5), ("virginica", 1.8), ("setosa", 0.2), ("setosa", 0.3), ("setosa", 0.1), ("setosa", 0.1), ("setosa", 0.1), ("setosa", 0.1), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.4), ("setosa", 0.4), ("setosa", 0.4), ("versicolor",1.2), ("versicolor",1.4), ("versicolor",1.1), ("versicolor",1.3), ("versicolor",1.3), ("versicolor",1.4), ("versicolor",1.8), ("virginica", 1.8), ("virginica", 1.8), ("virginica", 2.3), ("virginica", 1.9), ("virginica", 2.2), ("virginica", 1.6), ("virginica", 1.8), ("virginica", 1.8), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.5), ("setosa", 0.3), ("setosa", 0.2), ("setosa", 0.4), ("versicolor",1.0), 
("versicolor",1.0), ("versicolor",1.1), ("versicolor",1.0), ("versicolor",1.3), ("versicolor",1.3), ("versicolor",1.5), ("versicolor",1.4), ("versicolor",1.3), ("virginica", 2.0), ("virginica", 1.5), ("virginica", 2.4), ("virginica", 2.3), ("virginica", 1.9), ("virginica", 1.9), ("versicolor",1.6), ("virginica", 1.8), ("virginica", 2.4), ("virginica", 2.4), ("virginica", 1.4), ("virginica", 2.2), ("virginica", 2.1), ("virginica", 1.8), ("virginica", 2.5), ("virginica", 2.3), ("virginica", 1.9), ("virginica", 2.1), ("setosa", 0.3), ("setosa", 0.3), ("setosa", 0.3), ("setosa", 0.1), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.4), ("versicolor", 1), ("versicolor", 1.2), ("versicolor", 1.4), ("versicolor", 1.4), ("versicolor", 1.3), ("virginica", 2), ("versicolor", 1.5), ("versicolor", 1.5), ("virginica", 1.8), ("virginica", 1.8), ("virginica", 2.3), ("virginica", 2.1), ("virginica", 2.3), ("virginica", 2.1), ("virginica", 2), ("virginica", 2.3), ("setosa", 0.1), ("setosa", 0.6), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.4), ("versicolor", 1.3), ("versicolor", 1.5), ("versicolor", 1.2), ("versicolor", 1.3), ("versicolor", 1.3), ("versicolor", 1.5), ("versicolor", 1.2), ("versicolor", 1.4), ("versicolor", 1.4), ("versicolor", 1.6), ("virginica", 2), ("virginica", 2.3), ("virginica", 2.5), ("virginica", 2.3), ("virginica", 2.1), ("virginica", 2), ("virginica", 2.2), ("setosa", 0.3), ("setosa", 0.3), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.4) ])`], expected_output: `CAST(959.3244057257594 AS FLOAT64)` }, ]); From 1a5facb42413b678e0f218e63ab28a96d1ac631e Mon Sep 17 00:00:00 2001 From: Ross Thomson Date: Thu, 4 Aug 2022 13:11:21 -0400 Subject: [PATCH 103/104] added anovaftest --- udfs/community/README.md | 83 ++++++++++++++++++++++++++++++++ 
udfs/community/anovafscore.sqlx | 37 ++++++++++++++ udfs/community/anovaftest.sqlx | 15 +++--- udfs/community/centralF_cdf.sqlx | 27 +++++++++++ udfs/community/test_cases.js | 19 +++++++- 5 files changed, 171 insertions(+), 10 deletions(-) create mode 100644 udfs/community/anovafscore.sqlx create mode 100644 udfs/community/centralF_cdf.sqlx diff --git a/udfs/community/README.md b/udfs/community/README.md index cef6f3f8b..13dda77e1 100644 --- a/udfs/community/README.md +++ b/udfs/community/README.md @@ -844,6 +844,9 @@ SELECT bqutil.fn.int(1.684) ``` ## UDFs +* [anovafscore](#corr_pvaluer-float64-n-int64) +* [anovaftest](#corr_pvaluer-float64-n-int64) +* [centralF_udf](#corr_pvaluer-float64-n-int64) * [corr_pvalue](#corr_pvaluer-float64-n-int64) * [kruskal_wallis](#kruskal_wallisarraystructfactor-string-val-float64) * [linear_regression](#linear_regressionarraystructstructx-float64-y-float64) @@ -854,6 +857,86 @@ SELECT bqutil.fn.int(1.684) ## Documentation +### [anovafscore(data ARRAY<STRUCT<factor STRING, val FLOAT64>>)](anovafscore.sqlx) +Returns the f-score of the ANOVA test on the arrays of structs, having factor and value pairs. + +Input: +data: array of struct + > struct: string "factor", val "float64" + +Output: +The F value of the ANOVA score + +```sql +#standardSQL + +DECLARE data ARRAY<STRUCT<factor STRING, val FLOAT64>>; +set data = (SELECT ARRAY(SELECT AS STRUCT species, sepal_length FROM `bigquery-public-data.ml_datasets.iris`)); +SELECT bqutil.fn.anovafscore(data) +``` + +results: + +``` +119.26450218450492 +``` +----- + +### [anovaftest(data ARRAY<STRUCT<factor STRING, val FLOAT64>>)](anovaftest.sqlx) +Returns a struct of the p-value (p) and f-score (F) of the ANOVA test on the arrays of structs, having factor and value pairs. 
+ +Input: +data: array of struct + > struct: string "factor", val "float64" + +Output: +A struct containing the p-value (p) and the F value (F) of the ANOVA test + +```sql +#standardSQL + +DECLARE data ARRAY<STRUCT<factor STRING, val FLOAT64>>; +set data = (SELECT ARRAY(SELECT AS STRUCT species, sepal_length FROM `bigquery-public-data.ml_datasets.iris`)); +SELECT bqutil.fn.anovaftest(data) +``` + +results: + +``` +{ + "results": { + "p": "0.0", + "F": "119.26450218450492" + } +} +``` +----- + +### [centralF_cdf(F FLOAT64, df1 INT64, df2 INT64)](centralF_cdf.sqlx) +Given x in the range [0, infinity), returns the cumulative probability density of the central F distribution. That is, jStat.centralF.cdf(2.5, 10, 20) will return the probability that a number randomly selected from the central F distribution with df1 = 10 and df2 = 20 will be less than 2.5. + +`df1` is the "numerator degrees of freedom" and `df2` is the "denominator degrees of freedom", which parameterize the distribution. + +This function corresponds to the pf(q, df1, df2) function in R. + +Input: +F: the statistic returned by `anovaftest` +df1: numerator degrees of freedom +df2: denominator degrees of freedom + +Output: +p-value: the cumulative probability density of the central F distribution +```sql +SELECT bqutil.fn.centralF_cdf(2.5, 2, 40 - 2 - 1) +``` + +results: +``` +p: 0.15891301645400449 +``` +----- + + ### [corr_pvalue(r FLOAT64, n INT64)](corr_pvalue.sqlx) The returns the p value of the computed correlation coefficient based on the t-distribution. Input: diff --git a/udfs/community/anovafscore.sqlx b/udfs/community/anovafscore.sqlx new file mode 100644 index 000000000..4ac8f49dd --- /dev/null +++ b/udfs/community/anovafscore.sqlx @@ -0,0 +1,37 @@ +config { hasOutput: true } +/* + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CREATE OR REPLACE FUNCTION ${self()}(data ARRAY>) AS (( + WITH raw_data AS ( + SELECT d.val AS v, d.factor as f + FROM UNNEST(data) AS d + ), + global_stats AS ( + SELECT AVG(v) as M, COUNT(*) - COUNT(DISTINCT f) as cnt FROM raw_data + ), + exp_var AS ( + SELECT COUNT(v) * POW(AVG(v) - (select M from global_stats), 2) as var, NULL as cnt, NULL as discnt FROM raw_data GROUP BY f + ), + unexp_var as ( + SELECT COUNT(v) * VAR_POP(v) as var FROM raw_data GROUP BY f + ) + + SELECT + (SELECT SUM(var) / (COUNT(var)-1) FROM exp_var) + / (SELECT SUM(var) / (select cnt from global_stats) FROM unexp_var) + +)); diff --git a/udfs/community/anovaftest.sqlx b/udfs/community/anovaftest.sqlx index 82e631fe2..be02f5572 100644 --- a/udfs/community/anovaftest.sqlx +++ b/udfs/community/anovaftest.sqlx @@ -15,23 +15,20 @@ config { hasOutput: true } * limitations under the License. 
*/ + CREATE OR REPLACE FUNCTION ${self()}(data ARRAY>) AS (( WITH raw_data AS ( SELECT d.val AS v, d.factor as f FROM UNNEST(data) AS d ), global_stats AS ( - SELECT AVG(v) as M, COUNT(*) - COUNT(DISTINCT f) as cnt FROM raw_data - ), - exp_var AS ( - SELECT COUNT(v) * POW(AVG(v) - (select M from global_stats), 2) as var, NULL as cnt, NULL as discnt FROM raw_data GROUP BY f + SELECT COUNT(*) as N, COUNT(DISTINCT f)-1 as df1 FROM raw_data ), - unexp_var as ( - SELECT COUNT(v) * VAR_POP(v) as var FROM raw_data GROUP BY f + f_score as ( + (SELECT ${ref("anovafscore")}(data) as F) ) - SELECT - (SELECT SUM(var) / (COUNT(var)-1) FROM exp_var) - / (SELECT SUM(var) / (select cnt from global_stats) FROM unexp_var) + SELECT struct(cast(1 - ${ref("centralF_cdf")}(F, df1, N - df1 - 1) as FLOAT64) as p, F as F) from f_score, global_stats + )); \ No newline at end of file diff --git a/udfs/community/centralF_cdf.sqlx b/udfs/community/centralF_cdf.sqlx new file mode 100644 index 000000000..30aec9a75 --- /dev/null +++ b/udfs/community/centralF_cdf.sqlx @@ -0,0 +1,27 @@ +config { hasOutput: true } +/* + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#standardSQL + +CREATE OR REPLACE FUNCTION ${self()}(F FLOAT64, dof1 INT64, dof2 INT64) +RETURNS FLOAT64 +LANGUAGE js AS """ + return 1.0 - jstat.jStat['centralF'].cdf(F, dof1, dof2) +""" +OPTIONS ( + library=["${JS_BUCKET}/jstat-v1.9.4.min.js"] +); + diff --git a/udfs/community/test_cases.js b/udfs/community/test_cases.js index 325b9eeaa..000fa1eb6 100644 --- a/udfs/community/test_cases.js +++ b/udfs/community/test_cases.js @@ -998,18 +998,35 @@ generate_udf_test("kruskal_wallis", [ expected_output: `STRUCT(CAST(3.423076923076927 AS FLOAT64) AS H, CAST( 0.1805877514841956 AS FLOAT64) AS p, CAST(2 AS INT64) AS DoF)` }, ]); -generate_udf_test("anovaftest", [ +generate_udf_test("anovafscore", [ { inputs: [`(SELECT [("versicolor",1.1), ("versicolor",1.0), ("versicolor",1.0), ("versicolor",1.0), ("versicolor",1.2), ("versicolor",1.3), ("versicolor",1.3), ("versicolor",1.3), ("versicolor",1.5), ("versicolor",1.5), ("versicolor",1.5), ("versicolor",1.5), ("versicolor",1.5), ("virginica", 1.7), ("versicolor",1.6), ("versicolor",1.3), ("virginica", 2.0), ("virginica", 1.5), ("versicolor",1.7), ("virginica", 1.9), ("virginica", 2.1), ("virginica", 1.8), ("virginica", 1.8), ("virginica", 2.5), ("virginica", 1.8), ("setosa", 0.2), ("setosa", 0.3), ("setosa", 0.1), ("setosa", 0.1), ("setosa", 0.1), ("setosa", 0.1), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.4), ("setosa", 0.4), ("setosa", 0.4), ("versicolor",1.2), ("versicolor",1.4), ("versicolor",1.1), ("versicolor",1.3), ("versicolor",1.3), ("versicolor",1.4), ("versicolor",1.8), ("virginica", 1.8), ("virginica", 1.8), ("virginica", 2.3), ("virginica", 1.9), ("virginica", 2.2), ("virginica", 1.6), ("virginica", 1.8), ("virginica", 1.8), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.5), ("setosa", 0.3), ("setosa", 0.2), ("setosa", 0.4), ("versicolor",1.0), ("versicolor",1.0), ("versicolor",1.1), ("versicolor",1.0), ("versicolor",1.3), 
("versicolor",1.3), ("versicolor",1.5), ("versicolor",1.4), ("versicolor",1.3), ("virginica", 2.0), ("virginica", 1.5), ("virginica", 2.4), ("virginica", 2.3), ("virginica", 1.9), ("virginica", 1.9), ("versicolor",1.6), ("virginica", 1.8), ("virginica", 2.4), ("virginica", 2.4), ("virginica", 1.4), ("virginica", 2.2), ("virginica", 2.1), ("virginica", 1.8), ("virginica", 2.5), ("virginica", 2.3), ("virginica", 1.9), ("virginica", 2.1), ("setosa", 0.3), ("setosa", 0.3), ("setosa", 0.3), ("setosa", 0.1), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.4), ("versicolor", 1), ("versicolor", 1.2), ("versicolor", 1.4), ("versicolor", 1.4), ("versicolor", 1.3), ("virginica", 2), ("versicolor", 1.5), ("versicolor", 1.5), ("virginica", 1.8), ("virginica", 1.8), ("virginica", 2.3), ("virginica", 2.1), ("virginica", 2.3), ("virginica", 2.1), ("virginica", 2), ("virginica", 2.3), ("setosa", 0.1), ("setosa", 0.6), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.4), ("versicolor", 1.3), ("versicolor", 1.5), ("versicolor", 1.2), ("versicolor", 1.3), ("versicolor", 1.3), ("versicolor", 1.5), ("versicolor", 1.2), ("versicolor", 1.4), ("versicolor", 1.4), ("versicolor", 1.6), ("virginica", 2), ("virginica", 2.3), ("virginica", 2.5), ("virginica", 2.3), ("virginica", 2.1), ("virginica", 2), ("virginica", 2.2), ("setosa", 0.3), ("setosa", 0.3), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.4) ])`], expected_output: `CAST(959.3244057257594 AS FLOAT64)` }, ]); +generate_udf_test("anovaftest", [ + { + inputs: [`(SELECT [("versicolor",1.1), ("versicolor",1.0), ("versicolor",1.0), ("versicolor",1.0), ("versicolor",1.2), ("versicolor",1.3), ("versicolor",1.3), ("versicolor",1.3), ("versicolor",1.5), ("versicolor",1.5), ("versicolor",1.5), ("versicolor",1.5), ("versicolor",1.5), ("virginica", 
1.7), ("versicolor",1.6), ("versicolor",1.3), ("virginica", 2.0), ("virginica", 1.5), ("versicolor",1.7), ("virginica", 1.9), ("virginica", 2.1), ("virginica", 1.8), ("virginica", 1.8), ("virginica", 2.5), ("virginica", 1.8), ("setosa", 0.2), ("setosa", 0.3), ("setosa", 0.1), ("setosa", 0.1), ("setosa", 0.1), ("setosa", 0.1), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.4), ("setosa", 0.4), ("setosa", 0.4), ("versicolor",1.2), ("versicolor",1.4), ("versicolor",1.1), ("versicolor",1.3), ("versicolor",1.3), ("versicolor",1.4), ("versicolor",1.8), ("virginica", 1.8), ("virginica", 1.8), ("virginica", 2.3), ("virginica", 1.9), ("virginica", 2.2), ("virginica", 1.6), ("virginica", 1.8), ("virginica", 1.8), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.5), ("setosa", 0.3), ("setosa", 0.2), ("setosa", 0.4), ("versicolor",1.0), ("versicolor",1.0), ("versicolor",1.1), ("versicolor",1.0), ("versicolor",1.3), ("versicolor",1.3), ("versicolor",1.5), ("versicolor",1.4), ("versicolor",1.3), ("virginica", 2.0), ("virginica", 1.5), ("virginica", 2.4), ("virginica", 2.3), ("virginica", 1.9), ("virginica", 1.9), ("versicolor",1.6), ("virginica", 1.8), ("virginica", 2.4), ("virginica", 2.4), ("virginica", 1.4), ("virginica", 2.2), ("virginica", 2.1), ("virginica", 1.8), ("virginica", 2.5), ("virginica", 2.3), ("virginica", 1.9), ("virginica", 2.1), ("setosa", 0.3), ("setosa", 0.3), ("setosa", 0.3), ("setosa", 0.1), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.4), ("versicolor", 1), ("versicolor", 1.2), ("versicolor", 1.4), ("versicolor", 1.4), ("versicolor", 1.3), ("virginica", 2), ("versicolor", 1.5), ("versicolor", 1.5), ("virginica", 1.8), ("virginica", 1.8), ("virginica", 2.3), ("virginica", 2.1), ("virginica", 2.3), ("virginica", 2.1), ("virginica", 2), ("virginica", 2.3), ("setosa", 0.1), 
("setosa", 0.6), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.4), ("versicolor", 1.3), ("versicolor", 1.5), ("versicolor", 1.2), ("versicolor", 1.3), ("versicolor", 1.3), ("versicolor", 1.5), ("versicolor", 1.2), ("versicolor", 1.4), ("versicolor", 1.4), ("versicolor", 1.6), ("virginica", 2), ("virginica", 2.3), ("virginica", 2.5), ("virginica", 2.3), ("virginica", 2.1), ("virginica", 2), ("virginica", 2.2), ("setosa", 0.3), ("setosa", 0.3), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.2), ("setosa", 0.4) ])`], + expected_output: `STRUCT(CAST(1.0 AS FLOAT64) AS p, CAST(959.3244057257594 AS FLOAT64) AS F)` + }, +]); generate_udf_test("linear_regression", [ { inputs: [`(SELECT [ (5.1,2.5), (5.0,2.0), (5.7,2.6), (6.0,2.2), (5.8,2.6), (5.5,2.3), (6.1,2.8), (5.5,2.5), (6.4,3.2), (5.6,3.0)])`], expected_output: `STRUCT(CAST(-0.4353361094588436 AS FLOAT64) AS a, CAST( 0.5300416418798544 AS FLOAT64) AS b, CAST(0.632366563565354 AS FLOAT64) AS r)` }, ]); +generate_udf_test("centralF_cdf", [ + { + inputs: [ + `CAST(0.9 AS FLOAT64)`, + `CAST(25 AS INT64)`, + `CAST(5 AS INT64)` + + ], + expected_output: `CAST(0.00004357692924095158 AS FLOAT64)` + }, +]); generate_udf_test("corr_pvalue", [ { inputs: [ From 9c533650c0a3d41109ff72a850f02d3dbf279192 Mon Sep 17 00:00:00 2001 From: Ross Thomson Date: Thu, 4 Aug 2022 13:15:54 -0400 Subject: [PATCH 104/104] updated link addresses --- udfs/community/README.md | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/udfs/community/README.md b/udfs/community/README.md index 13dda77e1..0cea844a8 100644 --- a/udfs/community/README.md +++ b/udfs/community/README.md @@ -13,6 +13,9 @@ SELECT bqutil.fn.int(1.684) ## UDFs +* [anovafscore](#anovafscoredata-arraystructfactor-string-val-float64) +* [anovaftest](#anovaftestdata-arraystructfactor-string-val-float64) +* [centralF_udf](#centralf_cdff-float64-df1-int64-df2-int64) * 
[chisquare_cdf](#chisquare_cdfh-float64-dof-float64) * [corr_pvalue](#corr_pvaluer-float64-n-int64) * [csv_to_struct](#csv_to_structstrlist-string) @@ -844,9 +847,9 @@ SELECT bqutil.fn.int(1.684) ``` ## UDFs -* [anovafscore](#corr_pvaluer-float64-n-int64) -* [anovaftest](#corr_pvaluer-float64-n-int64) -* [centralF_udf](#corr_pvaluer-float64-n-int64) +* [anovafscore](#anovafscoredata-arraystructfactor-string-val-float64) +* [anovaftest](#anovaftestdata-arraystructfactor-string-val-float64) +* [centralF_udf](#centralf_cdff-float64-df1-int64-df2-int64) * [corr_pvalue](#corr_pvaluer-float64-n-int64) * [kruskal_wallis](#kruskal_wallisarraystructfactor-string-val-float64) * [linear_regression](#linear_regressionarraystructstructx-float64-y-float64)