@@ -1,9 +1,8 @@
 package fr.polytechnique.cmap.cnam.filtering.cox

-import org.apache.spark.sql.{DataFrame, Dataset}
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.hive.HiveContext
-import com.typesafe.config.{Config, ConfigFactory}
+import org.apache.spark.sql.{DataFrame, Dataset}
 import fr.polytechnique.cmap.cnam.Main
 import fr.polytechnique.cmap.cnam.filtering._

@@ -14,26 +13,40 @@ object CoxMain extends Main {

   def appName = "CoxFeaturing"

-  def coxFeaturing(sqlContext: HiveContext,
-                   config: Config,
-                   cancerDefinition: String,
-                   filterDelayedPatients: Boolean): Unit = {
-    import sqlContext.implicits._
+  def run(sqlContext: HiveContext, argsMap: Map[String, String]): Option[Dataset[_]] = {
+
+    val flatEvents: Dataset[FlatEvent] = FilteringMain.run(sqlContext, argsMap).get
+    coxFeaturing(flatEvents, argsMap)
+  }
+
+  def coxFeaturing(flatEvents: Dataset[FlatEvent], argsMap: Map[String, String]): Option[Dataset[_]] = {
+    import flatEvents.sqlContext.implicits._

-    val flatEventPath = config.getString("paths.input.flatEvent")
-    val flatDcirPath = config.getString("paths.input.flatDcir")
-    val outputRoot = config.getString("paths.output.root")
+    val sqlContext = flatEvents.sqlContext
+
+    argsMap.get("conf").foreach(sqlContext.setConf("conf", _))
+    argsMap.get("env").foreach(sqlContext.setConf("env", _))
+
+    val cancerDefinition: String = FilteringConfig.cancerDefinition
+    val filterDelayedPatients: Boolean = CoxConfig.filterDelayedPatients
+    val outputRoot = FilteringConfig.outputPaths.coxFeatures
     val outputDir = s"$outputRoot/$cancerDefinition/$filterDelayedPatients"

-    logger.info(s"Reading flat events from $flatEventPath...")
+    logger.info("Running FilteringMain...")
+
+    val dcirFlat: DataFrame = sqlContext.read.parquet(FilteringConfig.inputPaths.dcir)

-    val dcirFlat: DataFrame = sqlContext.read.parquet(flatDcirPath)
-    val flatEvents: DataFrame = sqlContext.read.parquet(flatEventPath)
+    val drugFlatEvents = flatEvents.filter(_.category == "molecule")
+    val diseaseFlatEvents = flatEvents.filter(_.category == "disease")

-    val drugFlatEvents = flatEvents.filter(col("category") === "molecule").as[FlatEvent]
-    val diseaseFlatEvents = flatEvents.filter(col("category") === "disease").as[FlatEvent]
-    val patientColumns = Array($"patientID", $"gender", $"birthDate", $"deathDate")
-    val patients = flatEvents.select(patientColumns: _*).distinct.as[Patient]
+    val patients: Dataset[Patient] = flatEvents
+      .map(
+        x => Patient(
+          x.patientID,
+          x.gender,
+          x.birthDate,
+          x.deathDate)
+      ).distinct

     logger.info("Number of drug events: " + drugFlatEvents.count)
     logger.info("Caching disease events...")
@@ -85,21 +98,7 @@ object CoxMain extends Main {
     import CoxFeaturesWriter._
     coxFeatures.toDF.write.parquet(s"$outputDir/cox")
     coxFeatures.writeCSV(s"$outputDir/cox.csv")
-  }

-  override def main(args: Array[String]): Unit = {
-    startContext()
-    val (environment: String, cancerDefinition: String, filterDelayedPatients: Boolean) =
-      args match {
-        case Array(arg1, args2, args3) => (args(0), args(1), args(2).toBoolean)
-        case Array(arg1, args2) => (args(0), args(1), true)
-        case _ => ("test", "broad", true)
-      }
-    val config: Config = ConfigFactory.parseResources("config/filtering-default.conf").getConfig(environment)
-    coxFeaturing(sqlContext, config, cancerDefinition, filterDelayedPatients)
-    stopContext()
+    Some(coxFeatures)
   }
-
-  // todo: refactor this function
-  def run(sqlContext: HiveContext, argsMap: Map[String, String]): Option[Dataset[_]] = None
 }
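// Usage sketch (illustrative, not part of this commit): after the refactor, the Cox
// pipeline is driven entirely through run(), which executes FilteringMain and then
// coxFeaturing on the resulting FlatEvent dataset. The "conf" and "env" entries of
// argsMap are simply forwarded to the SQL context; the values below are hypothetical.
//
//   val features: Option[Dataset[_]] =
//     CoxMain.run(sqlContext, Map("env" -> "test", "conf" -> "filtering.conf"))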