
Commit c2c74dd

Merge pull request #15 from data-intuitive/develop: Use parquet file for genesDB and use single cell info for statistics calculation

2 parents: 88e734c + 94988cc

File tree

5 files changed: +91 −77 lines

  README.md
  build.sbt
  config/example.conf
  src/main/scala/com/dataintuitive/luciusapi/IO.scala
  src/main/scala/com/dataintuitive/luciusapi/initialize.scala

README.md

Lines changed: 1 addition & 0 deletions

@@ -11,6 +11,7 @@ There's still a lot of work to be done on this (version numbers don't reflect ev
 | 5.0.0 | 4.0.10 | 0.11.1 | 2.4.7 |
 | 5.0.1 | 4.0.11 | 0.11.1 | 2.4.7 |
 | 5.1.0 | 4.1.1 | 0.11.1 | 2.4.7 |
+| 5.1.1 | 4.1.1 | 0.11.1 | 2.4.7 |
 
 # API Documentation
 

build.sbt

Lines changed: 1 addition & 1 deletion

@@ -2,7 +2,7 @@ name := "LuciusAPI"
 
 import aether.AetherKeys._
 
-ThisBuild / version := "5.1.0"
+ThisBuild / version := "5.1.1"
 
 scalaVersion := "2.11.12"
 

config/example.conf

Lines changed: 0 additions & 16 deletions

@@ -6,20 +6,4 @@
   geneAnnotations = "s3://..."
   storageLevel = "MEMORY_ONLY_1"
   partitions = 1
-  geneFeatures {
-    probesetID = probesetid,
-    dataType = dataType,
-    dataType2 = dataType2,
-    ENTREZID = entrezid,
-    ENSEMBL = ensemblid,
-    SYMBOL = symbol,
-    GENENAME = name,
-    GENEFAMILY = geneFamily
-  }
-  geneDataType {
-    "1-1" = "L1000"
-    "0-1" = "BING"
-    "0-0" = "AIG"
-    "1-0" = "INVALID"
-  }
 }
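
The removed blocks configured the old tab-delimited gene loader: geneFeatures mapped fields of the Gene model onto column names in the annotations file, and geneDataType translated the landmark/best_inferred flag pair (e.g. "1-1") into a data-type label. The parquet loader introduced in IO.scala below hard-codes both mappings, so the config shrinks to the keys shown in context. Below is a minimal sketch of reading the trimmed config with Typesafe Config, the format spark-jobserver hands to validate(); the values are placeholders and a real example.conf carries more settings:

    import com.typesafe.config.{Config, ConfigFactory}

    // Trimmed job config after this commit (placeholder values).
    val conf: Config = ConfigFactory.parseString("""
      geneAnnotations = "s3://bucket/genes.parquet"
      storageLevel = "MEMORY_ONLY"
      partitions = 1
    """)

    // The handlers in Common.ParamHandlers read these keys during validate();
    // plain Config accessors are shown here for illustration.
    val geneAnnotations = conf.getString("geneAnnotations")
    val storageLevel = conf.getString("storageLevel")
    val partitions = conf.getInt("partitions")
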
src/main/scala/com/dataintuitive/luciusapi/IO.scala

Lines changed: 82 additions & 0 deletions
@@ -0,0 +1,82 @@
+package com.dataintuitive.luciusapi
+
+import com.dataintuitive.luciuscore._
+import model.v4_1._
+import genes._
+import api.v4_1._
+import io.GenesIO._
+import io.{ Version, DatedVersionedObject, State }
+
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.functions.{lit, typedLit, concat, array, monotonicallyIncreasingId, col}
+
+import org.apache.hadoop.fs.{FileSystem, Path}
+
+object IO {
+
+  def getGenesDB(sparkSession: SparkSession, geneAnnotationsFile: String): GenesDB = {
+    import sparkSession.implicits._
+
+    val genesRaw = sparkSession.read.parquet(geneAnnotationsFile)
+    val genesMapped = genesRaw
+      .drop("gene_description", "probe_set_id")
+      .withColumn("index", monotonicallyIncreasingId.cast("integer"))
+      .withColumnRenamed("gene_id", "id")
+      .withColumn("entrezid", typedLit(Option.empty[Set[String]]))
+      .withColumn("ensemblid", typedLit(Option.empty[Set[String]]))
+      .withColumn("symbol", array($"gene_symbol"))
+      .withColumn("name", typedLit(Option.empty[Set[String]]))
+      .withColumn("geneFamily", typedLit(Option.empty[Set[String]]))
+      .withColumn("dataType1", col("landmark").cast("integer"))
+      .withColumn("dataType2", col("best_inferred").cast("integer"))
+      .withColumn("dataType", concat(col("dataType1"), lit("-"), col("dataType2")))
+      .as[Gene]
+      .collect
+
+    genes.GenesDB(genesMapped)
+  }
+
+  def allInput(sparkSession: SparkSession, path: List[String]): List[DatedVersionedObject[Path]] = {
+    import sparkSession.implicits._
+
+    val fs = FileSystem.get(sparkSession.sparkContext.hadoopConfiguration)
+
+    val outputList =
+      path.flatMap(p => {
+        val pp = new Path(p)
+        if (pp.toString.contains(".parquet"))
+          List(pp)
+            .map(x => (x.getName, x.getParent, x))
+        else
+          fs
+            .listStatus(pp)
+            .map(_.getPath)
+            .map(x => (x.getName, x.getParent, x))
+            .filter(_._1.toString() contains ".parquet")
+      })
+    val outputs = outputList.map { case (name, path, fullPath) =>
+      val p = sparkSession.read.parquet(fullPath.toString).as[Perturbation]
+      val version: Version =
+        p.first
+          .meta
+          .filter { case MetaInformation(key, value) => key == "version" }
+          .headOption
+          .map(_.value)
+          .map(Version(_))
+          .getOrElse(Version(0, 0))
+      val dateStrO =
+        p.first
+          .meta
+          .filter { case MetaInformation(key, value) => key == "processingDate" }
+          .headOption
+          .map(_.value)
+      val date = dateStrO.map(java.time.LocalDate.parse).getOrElse(java.time.LocalDate.MIN)
+      DatedVersionedObject(date, version, fullPath)
+    }.toList
+
+    outputs
+
+  }
+
+
+}
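
getGenesDB above expects a parquet file whose columns match the names it references: gene_id, gene_symbol, landmark and best_inferred feed the Gene model, while gene_description and probe_set_id are dropped. A minimal sketch of writing a compatible fixture and loading it, assuming a local SparkSession; the rows, symbols and path are made up:

    import org.apache.spark.sql.SparkSession

    val spark = SparkSession.builder
      .master("local[*]")
      .appName("genesdb-fixture")
      .getOrCreate()
    import spark.implicits._

    // Two made-up annotation rows with the column set getGenesDB reads.
    // landmark/best_inferred use the 0/1 encoding that getGenesDB
    // concatenates into dataType ("1-1" and "0-1" here).
    Seq(
      ("g1", "SYMBOL1", "first gene", "ps1", 1, 1),
      ("g2", "SYMBOL2", "second gene", "ps2", 0, 1)
    ).toDF("gene_id", "gene_symbol", "gene_description", "probe_set_id",
           "landmark", "best_inferred")
      .write.mode("overwrite").parquet("/tmp/genes.parquet")

    val genesDB = IO.getGenesDB(spark, "/tmp/genes.parquet")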

src/main/scala/com/dataintuitive/luciusapi/initialize.scala

Lines changed: 7 additions & 60 deletions

@@ -5,8 +5,8 @@ import model.v4_1._
 import genes._
 import api.v4_1._
 import io.GenesIO._
-import io.{ Version, DatedVersionedObject, State }
-import lenses.CombinedPerturbationLenses.safeCellLens
+import io.State
+import lenses.CombinedPerturbationLenses.safeCellDetailsLens
 
 import Common.ParamHandlers._
 import com.dataintuitive.jobserver._
@@ -28,17 +28,14 @@ import org.apache.spark.sql.Encoders
 
 import org.apache.spark.storage.StorageLevel
 import org.apache.spark.storage.StorageLevel._
-import org.apache.hadoop.fs.{FileSystem, Path}
 
 object initialize extends SparkSessionJob with NamedObjectSupport {
 
   case class JobData(dbs: List[String],
                      geneAnnotations: String,
                      dbVersion: String,
                      partitions: Int,
-                     storageLevel: StorageLevel,
-                     geneFeatures: Map[String, String],
-                     geneDataTypes: Map[String, String])
+                     storageLevel: StorageLevel)
   type JobOutput = collection.Map[String, Any]
 
   override def validate(sparkSession: SparkSession,
@@ -50,10 +47,8 @@ object initialize extends SparkSessionJob with NamedObjectSupport {
     val dbVersion = paramDbVersion(config)
     val partitions = paramPartitions(config)
     val storageLevel = paramStorageLevel(config)
-    val geneFeatures = paramGeneFeatures(config)
-    val geneDataTypes = paramGeneDataTypes(config)
 
-    withGood(db, genes) { JobData(_, _, dbVersion, partitions, storageLevel, geneFeatures, geneDataTypes) }
+    withGood(db, genes) { JobData(_, _, dbVersion, partitions, storageLevel) }
 
   }
 
@@ -81,60 +76,12 @@ object initialize extends SparkSessionJob with NamedObjectSupport {
       .set("fs.s3n.awsSecretAccessKey", fs_s3_awsSecretAccessKey)
 
     // Loading gene annotations and broadcast
-    val genes =
-      loadGenesFromFile(sparkSession.sparkContext, data.geneAnnotations, delimiter="\t", dict = data.geneFeatures, dataTypeDict = data.geneDataTypes)
-    val genesDB = new GenesDB(genes)
+    val genesDB = IO.getGenesDB(sparkSession, data.geneAnnotations)
     val genesBC = sparkSession.sparkContext.broadcast(genesDB)
 
     runtime.namedObjects.update("genes", NamedBroadcast(genesBC))
 
-    // Add inline, should be moved elsewhere --- START
-
-    def allInput(sparkSession: SparkSession, path: List[String]): List[DatedVersionedObject[Path]] = {
-      import sparkSession.implicits._
-
-      val fs = FileSystem.get(sparkSession.sparkContext.hadoopConfiguration)
-
-      val outputList =
-        path.flatMap(p => {
-          val pp = new Path(p)
-          if (pp.toString.contains(".parquet"))
-            List(pp)
-              .map(x => (x.getName, x.getParent, x))
-          else
-            fs
-              .listStatus(pp)
-              .map(_.getPath)
-              .map(x => (x.getName, x.getParent, x))
-              .filter(_._1.toString() contains ".parquet")
-        })
-      val outputs = outputList.map { case (name, path, fullPath) =>
-        val p = sparkSession.read.parquet(fullPath.toString).as[Perturbation]
-        val version: Version =
-          p.first
-            .meta
-            .filter { case MetaInformation(key, value) => key == "version" }
-            .headOption
-            .map(_.value)
-            .map(Version(_))
-            .getOrElse(Version(0, 0))
-        val dateStrO =
-          p.first
-            .meta
-            .filter { case MetaInformation(key, value) => key == "processingDate" }
-            .headOption
-            .map(_.value)
-        val date = dateStrO.map(java.time.LocalDate.parse).getOrElse(java.time.LocalDate.MIN)
-        DatedVersionedObject(date, version, fullPath)
-      }.toList
-
-      outputs
-
-    }
-
-    // END
-
-    val outputs = allInput(sparkSession, data.dbs)
+    val outputs = IO.allInput(sparkSession, data.dbs)
     val state = State(outputs)
 
     val thisVersion = state.state.filter(_.version.major.toString == data.dbVersion)
@@ -165,7 +112,7 @@ object initialize extends SparkSessionJob with NamedObjectSupport {
     val flatDb = db.map( row =>
       FlatDbRow(
         row.id,
-        safeCellLens.get(row),
+        safeCellDetailsLens.get(row).head,
         row.trt.trt_cp.map(_.dose).getOrElse("N/A"),
         row.trtType,
         row.trt.trt.name,
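
The last hunk is the "single cell info" half of the commit message: FlatDbRow no longer reads a single cell value through safeCellLens but takes the head of the cell-details collection exposed by safeCellDetailsLens, so the statistics are computed from the first cell entry of each perturbation. A toy sketch of that access pattern with simplified stand-in types, not the real luciuscore model; note that a bare .head throws on an empty collection, which the guarded variant below avoids:

    // Stand-in types for illustration only; the real model lives in
    // com.dataintuitive.luciuscore.model.v4_1.
    case class CellDetails(cell: String, tissue: Option[String])
    case class PerturbationLike(id: String, cellDetails: List[CellDetails])

    // Analogue of safeCellDetailsLens.get(row).head, but with an explicit
    // fallback instead of a bare .head.
    def firstCell(row: PerturbationLike): String =
      row.cellDetails.headOption.map(_.cell).getOrElse("N/A")

    val row = PerturbationLike("p1", List(CellDetails("MCF7", Some("breast"))))
    assert(firstCell(row) == "MCF7")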
