-
Notifications
You must be signed in to change notification settings - Fork 0
Open
Labels
question ❓Further information is requestedFurther information is requested
Description
I ran the following command
Rscript "process-gds-file-with-dracarys.R" \
--out-dir /path/to/tempdir \
--file-prefix "multiqc_data" \
--file-type "MultiqcFile" \
--presigned-url "presigned_url"
The script takes in a presigned URL and writes out a parquet file using the arrow library.
I moved away from csv in the hope that parquet would retain the data types; however, I've noticed that many of the values that should be ints are written as floats. Is this something that dracarys sets?
If I read this parquet file back into python with
df_pd = pd.read_parquet(
parquet_file_path
)
# Show an example column that should be an int
df_pd["reads_qcfail_dragen"]
0 0.0
1 0.0
Name: reads_qcfail_dragen, dtype: float64
I would expect this column to be an int rather than a float.
process-gds-file-with-dracarys.R
Click to show script
#!/usr/bin/env Rscript
# Load libraries
library("optparse")
library("logger")
suppressMessages(library("tidyverse"))
library("dracarys")
library("glue")
library("arrow")
# Functions
#' Read a dracarys file object and package the result as a named list.
#'
#' @param dr_func_obj Object exposing a `read()` method; it is called once.
#' @param data_type File type string. "TsoSampleAnalysisResultsFile" is
#'   split into three tables; any other value is used as the single list name.
#' @return A named list. For "TsoSampleAnalysisResultsFile" the names are
#'   "TsoQc", "TsoSnvs" and "TsoCnvs", each with a `sample_id` column added;
#'   otherwise a list holding the read result under `data_type`.
read_object <- function(dr_func_obj, data_type) {
  objp_read <- dr_func_obj$read()

  # Everything except the TSO sample analysis results is wrapped untouched
  # under its own data type name
  if (data_type != "TsoSampleAnalysisResultsFile") {
    return_list <- list()
    return_list[[data_type]] <- objp_read
    return(return_list)
  }

  # For this file type read() yields a list of items rather than a tibble:
  #   sample_info, software_config, biomarkers, qc, snvs, cnvs
  sample_id <- dplyr::pull(objp_read$sample_info, sampleId)

  # Stamp a table with the sample id taken from sample_info
  add_sample_id <- function(tbl) {
    dplyr::mutate(tbl, sample_id = sample_id)
  }

  list(
    "TsoQc" = add_sample_id(objp_read$qc),
    "TsoSnvs" = add_sample_id(objp_read$snvs),
    "TsoCnvs" = add_sample_id(objp_read$cnvs)
  )
}
# Get args
# Output goes to a directory because Dracarys may emit many files for the one
# input file, i.e SampleAnalysisResultsJson spits out multiple files
parser <- OptionParser(formatter = IndentedHelpFormatter) %>%
  add_option("--out-dir", help = "Output Directory") %>%
  add_option("--file-prefix", help = "Filename prefix") %>%
  add_option("--presigned-url", help = "Presigned URL to File") %>%
  add_option("--file-type", help = "FileType to Collect")

# Read args
opt <- parse_args(parser)

# Checks
# Every option is required; report the first missing one (in declaration
# order), show the usage text and exit non-zero
required_messages <- c(
  "out-dir" = "Please specify --out-dir parameter",
  "file-prefix" = "Please specify --file-prefix parameter",
  "presigned-url" = "Please specify --presigned-url parameter",
  "file-type" = "Please specify --file-type parameter"
)
for (opt_name in names(required_messages)) {
  if (is.null(opt[[opt_name]])) {
    logger::log_error(required_messages[[opt_name]])
    print_help(parser)
    quit(status = 1)
  }
}

# The output directory must already exist; it is not created here
if (!dir.exists(opt[["out-dir"]])) {
  logger::log_error(glue("Please ensure {opt[['out-dir']]} exists and try again"))
  quit(status = 1)
}

# An ICA access token must be present (and non-empty) in the environment
if (!nzchar(Sys.getenv("ICA_ACCESS_TOKEN", ""))) {
  logger::log_error("Could not get ICA_ACCESS_TOKEN from env var")
  quit(status = 1)
}
# Look up the dracarys reader for the requested file type
# NOTE(review): dr_func_eval is reached via `:::`, i.e. it is not part of the
# exported dracarys API and may change between releases - confirm on upgrade
function_name <- dracarys:::dr_func_eval(opt[["file-type"]])

# Generate data object from presigned url
data_obj <- function_name$new(opt[["presigned-url"]])

# Read in object; always yields a named list of tables (see read_object)
data_obj_list <- read_object(data_obj, opt[["file-type"]])

# Iterate over object list
# Write one parquet file per table
for (data_type in names(data_obj_list)) {
  # Get object from list
  data_obj_tbl <- data_obj_list[[data_type]]

  # Output file name: <out-dir>/<file-prefix>__<data_type>.parquet
  output_file_name <- file.path(
    opt[["out-dir"]],
    paste0(opt[["file-prefix"]], "__", data_type, ".parquet")
  )

  # Write out to parquet
  # NOTE(review): the table is written as returned by dracarys, with no type
  # conversion here; presumably arrow stores R double columns as float64, so
  # integer-valued columns held as double would come out as floats - confirm
  logger::log_info(glue("Writing out to parquet {output_file_name}"))
  arrow::write_parquet(data_obj_tbl, sink = output_file_name)
  logger::log_info("Writing output successful")
}
Metadata
Metadata
Assignees
Labels
question ❓Further information is requestedFurther information is requested