
Commit c2c74dd

Merge pull request #15 from data-intuitive/develop: Use parquet file for genesDB and use single cell info for statistics calculation

2 parents: 88e734c + 94988cc

File tree

5 files changed: +91 −77 lines

  README.md
  build.sbt
  config/example.conf
  src/main/scala/com/dataintuitive/luciusapi/IO.scala
  src/main/scala/com/dataintuitive/luciusapi/initialize.scala

README.md

Lines changed: 1 addition & 0 deletions

@@ -11,6 +11,7 @@ There's still a lot of work to be done on this (version numbers don't reflect ev
 | 5.0.0 | 4.0.10 | 0.11.1 | 2.4.7 |
 | 5.0.1 | 4.0.11 | 0.11.1 | 2.4.7 |
 | 5.1.0 | 4.1.1 | 0.11.1 | 2.4.7 |
+| 5.1.1 | 4.1.1 | 0.11.1 | 2.4.7 |
 
 # API Documentation
 

build.sbt

Lines changed: 1 addition & 1 deletion

@@ -2,7 +2,7 @@ name := "LuciusAPI"
 
 import aether.AetherKeys._
 
-ThisBuild / version := "5.1.0"
+ThisBuild / version := "5.1.1"
 
 scalaVersion := "2.11.12"
 

config/example.conf

Lines changed: 0 additions & 16 deletions

@@ -6,20 +6,4 @@
   geneAnnotations = "s3://..."
   storageLevel = "MEMORY_ONLY_1"
   partitions = 1
-  geneFeatures {
-    probesetID = probesetid,
-    dataType = dataType,
-    dataType2 = dataType2,
-    ENTREZID = entrezid,
-    ENSEMBL = ensemblid,
-    SYMBOL = symbol,
-    GENENAME = name,
-    GENEFAMILY = geneFamily
-  }
-  geneDataType {
-    "1-1" = "L1000"
-    "0-1" = "BING"
-    "0-0" = "AIG"
-    "1-0" = "INVALID"
-  }
 }
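
The removed blocks configured the old tab-delimited gene loader: geneFeatures mapped fields of the Gene model onto column names in the annotations file, and geneDataType translated the landmark/best_inferred flag pair (e.g. "1-1") into a data-type label. The parquet loader introduced in IO.scala below hard-codes both mappings, so the config shrinks to the keys shown in context. Below is a minimal sketch of reading the trimmed config with Typesafe Config, the format spark-jobserver hands to validate(); the values are placeholders and a real example.conf carries more settings:

    import com.typesafe.config.{Config, ConfigFactory}

    // Trimmed job config after this commit (placeholder values).
    val conf: Config = ConfigFactory.parseString("""
      geneAnnotations = "s3://bucket/genes.parquet"
      storageLevel = "MEMORY_ONLY"
      partitions = 1
    """)

    // The handlers in Common.ParamHandlers read these keys during validate();
    // plain Config accessors are shown here for illustration.
    val geneAnnotations = conf.getString("geneAnnotations")
    val storageLevel = conf.getString("storageLevel")
    val partitions = conf.getInt("partitions")
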
src/main/scala/com/dataintuitive/luciusapi/IO.scala

Lines changed: 82 additions & 0 deletions
@@ -0,0 +1,82 @@
+package com.dataintuitive.luciusapi
+
+import com.dataintuitive.luciuscore._
+import model.v4_1._
+import genes._
+import api.v4_1._
+import io.GenesIO._
+import io.{ Version, DatedVersionedObject, State }
+
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.functions.{lit, typedLit, concat, array, monotonicallyIncreasingId, col}
+
+import org.apache.hadoop.fs.{FileSystem, Path}
+
+object IO {
+
+  def getGenesDB(sparkSession: SparkSession, geneAnnotationsFile: String): GenesDB = {
+    import sparkSession.implicits._
+
+    val genesRaw = sparkSession.read.parquet(geneAnnotationsFile)
+    val genesMapped = genesRaw
+      .drop("gene_description", "probe_set_id")
+      .withColumn("index", monotonicallyIncreasingId.cast("integer"))
+      .withColumnRenamed("gene_id", "id")
+      .withColumn("entrezid", typedLit(Option.empty[Set[String]]))
+      .withColumn("ensemblid", typedLit(Option.empty[Set[String]]))
+      .withColumn("symbol", array($"gene_symbol"))
+      .withColumn("name", typedLit(Option.empty[Set[String]]))
+      .withColumn("geneFamily", typedLit(Option.empty[Set[String]]))
+      .withColumn("dataType1", col("landmark").cast("integer"))
+      .withColumn("dataType2", col("best_inferred").cast("integer"))
+      .withColumn("dataType", concat(col("dataType1"), lit("-"), col("dataType2")))
+      .as[Gene]
+      .collect
+
+    genes.GenesDB(genesMapped)
+  }
+
+  def allInput(sparkSession: SparkSession, path: List[String]): List[DatedVersionedObject[Path]] = {
+    import sparkSession.implicits._
+
+    val fs = FileSystem.get(sparkSession.sparkContext.hadoopConfiguration)
+
+    val outputList =
+      path.flatMap(p => {
+        val pp = new Path(p)
+        if (pp.toString.contains(".parquet"))
+          List(pp)
+            .map(x => (x.getName, x.getParent, x))
+        else
+          fs
+            .listStatus(pp)
+            .map(_.getPath)
+            .map(x => (x.getName, x.getParent, x))
+            .filter(_._1.toString() contains ".parquet")
+      })
+    val outputs = outputList.map { case (name, path, fullPath) =>
+      val p = sparkSession.read.parquet(fullPath.toString).as[Perturbation]
+      val version: Version =
+        p.first
+          .meta
+          .filter { case MetaInformation(key, value) => key == "version" }
+          .headOption
+          .map(_.value)
+          .map(Version(_))
+          .getOrElse(Version(0, 0))
+      val dateStrO =
+        p.first
+          .meta
+          .filter { case MetaInformation(key, value) => key == "processingDate" }
+          .headOption
+          .map(_.value)
+      val date = dateStrO.map(java.time.LocalDate.parse).getOrElse(java.time.LocalDate.MIN)
+      DatedVersionedObject(date, version, fullPath)
+    }.toList
+
+    outputs
+
+  }
+
+
+}
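
getGenesDB above expects a parquet file whose columns match the names it references: gene_id, gene_symbol, landmark and best_inferred feed the Gene model, while gene_description and probe_set_id are dropped. A minimal sketch of writing a compatible fixture and loading it, assuming a local SparkSession; the rows, symbols and path are made up:

    import org.apache.spark.sql.SparkSession

    val spark = SparkSession.builder
      .master("local[*]")
      .appName("genesdb-fixture")
      .getOrCreate()
    import spark.implicits._

    // Two made-up annotation rows with the column set getGenesDB reads.
    // landmark/best_inferred use the 0/1 encoding that getGenesDB
    // concatenates into dataType ("1-1" and "0-1" here).
    Seq(
      ("g1", "SYMBOL1", "first gene", "ps1", 1, 1),
      ("g2", "SYMBOL2", "second gene", "ps2", 0, 1)
    ).toDF("gene_id", "gene_symbol", "gene_description", "probe_set_id",
           "landmark", "best_inferred")
      .write.mode("overwrite").parquet("/tmp/genes.parquet")

    val genesDB = IO.getGenesDB(spark, "/tmp/genes.parquet")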

src/main/scala/com/dataintuitive/luciusapi/initialize.scala

Lines changed: 7 additions & 60 deletions

@@ -5,8 +5,8 @@ import model.v4_1._
 import genes._
 import api.v4_1._
 import io.GenesIO._
-import io.{ Version, DatedVersionedObject, State }
-import lenses.CombinedPerturbationLenses.safeCellLens
+import io.State
+import lenses.CombinedPerturbationLenses.safeCellDetailsLens
 
 import Common.ParamHandlers._
 import com.dataintuitive.jobserver._
@@ -28,17 +28,14 @@ import org.apache.spark.sql.Encoders
 
 import org.apache.spark.storage.StorageLevel
 import org.apache.spark.storage.StorageLevel._
-import org.apache.hadoop.fs.{FileSystem, Path}
 
 object initialize extends SparkSessionJob with NamedObjectSupport {
 
   case class JobData(dbs: List[String],
                      geneAnnotations: String,
                      dbVersion: String,
                      partitions: Int,
-                     storageLevel: StorageLevel,
-                     geneFeatures: Map[String, String],
-                     geneDataTypes: Map[String, String])
+                     storageLevel: StorageLevel)
   type JobOutput = collection.Map[String, Any]
 
   override def validate(sparkSession: SparkSession,
@@ -50,10 +47,8 @@ object initialize extends SparkSessionJob with NamedObjectSupport {
     val dbVersion = paramDbVersion(config)
     val partitions = paramPartitions(config)
     val storageLevel = paramStorageLevel(config)
-    val geneFeatures = paramGeneFeatures(config)
-    val geneDataTypes = paramGeneDataTypes(config)
 
-    withGood(db, genes) { JobData(_, _, dbVersion, partitions, storageLevel, geneFeatures, geneDataTypes) }
+    withGood(db, genes) { JobData(_, _, dbVersion, partitions, storageLevel) }
 
   }
 
@@ -81,60 +76,12 @@ object initialize extends SparkSessionJob with NamedObjectSupport {
       .set("fs.s3n.awsSecretAccessKey", fs_s3_awsSecretAccessKey)
 
     // Loading gene annotations and broadcast
-    val genes =
-      loadGenesFromFile(sparkSession.sparkContext, data.geneAnnotations, delimiter="\t", dict = data.geneFeatures, dataTypeDict = data.geneDataTypes)
-    val genesDB = new GenesDB(genes)
+    val genesDB = IO.getGenesDB(sparkSession, data.geneAnnotations)
     val genesBC = sparkSession.sparkContext.broadcast(genesDB)
 
     runtime.namedObjects.update("genes", NamedBroadcast(genesBC))
 
-    // Add inline, should be moved elsewhere --- START
-
-    def allInput(sparkSession: SparkSession, path: List[String]): List[DatedVersionedObject[Path]] = {
-      import sparkSession.implicits._
-
-      val fs = FileSystem.get(sparkSession.sparkContext.hadoopConfiguration)
-
-      val outputList =
-        path.flatMap(p => {
-          val pp = new Path(p)
-          if (pp.toString.contains(".parquet"))
-            List(pp)
-              .map(x => (x.getName, x.getParent, x))
-          else
-            fs
-              .listStatus(pp)
-              .map(_.getPath)
-              .map(x => (x.getName, x.getParent, x))
-              .filter(_._1.toString() contains ".parquet")
-        })
-      val outputs = outputList.map { case (name, path, fullPath) =>
-        val p = sparkSession.read.parquet(fullPath.toString).as[Perturbation]
-        val version: Version =
-          p.first
-            .meta
-            .filter { case MetaInformation(key, value) => key == "version" }
-            .headOption
-            .map(_.value)
-            .map(Version(_))
-            .getOrElse(Version(0, 0))
-        val dateStrO =
-          p.first
-            .meta
-            .filter { case MetaInformation(key, value) => key == "processingDate" }
-            .headOption
-            .map(_.value)
-        val date = dateStrO.map(java.time.LocalDate.parse).getOrElse(java.time.LocalDate.MIN)
-        DatedVersionedObject(date, version, fullPath)
-      }.toList
-
-      outputs
-
-    }
-
-    // END
-
-    val outputs = allInput(sparkSession, data.dbs)
+    val outputs = IO.allInput(sparkSession, data.dbs)
     val state = State(outputs)
 
     val thisVersion = state.state.filter(_.version.major.toString == data.dbVersion)
@@ -165,7 +112,7 @@ object initialize extends SparkSessionJob with NamedObjectSupport {
     val flatDb = db.map( row =>
       FlatDbRow(
         row.id,
-        safeCellLens.get(row),
+        safeCellDetailsLens.get(row).head,
         row.trt.trt_cp.map(_.dose).getOrElse("N/A"),
         row.trtType,
         row.trt.trt.name,
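
The last hunk is the "single cell info" half of the commit message: FlatDbRow no longer reads a single cell value through safeCellLens but takes the head of the cell-details collection exposed by safeCellDetailsLens, so the statistics are computed from the first cell entry of each perturbation. A toy sketch of that access pattern with simplified stand-in types, not the real luciuscore model; note that a bare .head throws on an empty collection, which the guarded variant below avoids:

    // Stand-in types for illustration only; the real model lives in
    // com.dataintuitive.luciuscore.model.v4_1.
    case class CellDetails(cell: String, tissue: Option[String])
    case class PerturbationLike(id: String, cellDetails: List[CellDetails])

    // Analogue of safeCellDetailsLens.get(row).head, but with an explicit
    // fallback instead of a bare .head.
    def firstCell(row: PerturbationLike): String =
      row.cellDetails.headOption.map(_.cell).getOrElse("N/A")

    val row = PerturbationLike("p1", List(CellDetails("MCF7", Some("breast"))))
    assert(firstCell(row) == "MCF7")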
