We will now take a look at the statistics of numeric features:
// Build a single stats table for all numeric columns of trainDF:
// describe() rows (count/mean/stddev/min/max) plus exact quartiles,
// exposed as the SQL temp table "allStats".
import org.apache.spark.sql.types._

// Every column that is not a string is treated as numeric here.
val numericFeatures = trainDF.schema.filter(_.dataType != StringType)

// count, mean, stddev, min, max for each numeric column.
val description = trainDF.describe(numericFeatures.map(_.name): _*)

// Quartiles per feature; relativeError = 0 forces an exact (but more
// expensive) computation. Transpose so each inner Seq holds one quantile
// (q1 / median / q3) across all features.
val quantiles = numericFeatures
  .map(f => trainDF.stat.approxQuantile(f.name, Array(.25, .5, .75), 0))
  .transpose

// Prefix each quantile row with its label to match describe()'s
// "summary" column layout.
val rowSeq = Seq(Seq("q1" +: quantiles(0): _*),
                 Seq("median" +: quantiles(1): _*),
                 Seq("q3" +: quantiles(2): _*))

// Convert each label-plus-10-doubles row into a tuple so toDF yields a
// DataFrame with the same 11-column shape as `description`.
val rows = rowSeq.map {
  case Seq(a: String, b: Double, c: Double, d: Double,
           e: Double, f: Double, g: Double,
           h: Double, i: Double, j: Double, k: Double) =>
    (a, b, c, d, e, f, g, h, i, j, k)
}

// unionAll matches columns by position, so the quantile rows line up
// under describe()'s output; register for the %sql query below.
val allStats = description.unionAll(sc.parallelize(rows).toDF)
allStats.registerTempTable("allStats")
%sql select * from allStats
>>>
summary | age | duration | campaign | pdays | previous |
count | 41188.00 | 41188.00 | 41188.00 | 41188.00 | 41188.00 |
mean | 40.02 | 258.29 | 2.57 | 962.48 | 0.17 |
stddev | 10.42 | 259.28 | 2.77 | 186.91 | 0.49 |
min | 17.00 | 0.00 | 1.00 | 0.00 | 0.00 |
max | 98.00 | 4918.00 | 56.00 | 999.00 | 7.00 |
q1 | 32.00 | 102.00 | 1.00 | 999.00 | 0.00 |
median | 38.00 | 180.00 | 2.00 | 999.00 | 0.00 |
q3 | 47.00 | 319.00 | 3.00 | 999.00 | 0.00 |
summary | emp_var_rate | cons_price_idx | cons_conf_idx | euribor3m | nr_employed |
count | 41188.00 | 41188.00 | 41188.00 | 41188.00 | 41188.00 |
mean | 0.08 | 93.58 | -40.50 | 3.62 | 5167.04 |
stddev | 1.57 | 0.58 | 4.63 | 1.73 | 72.25 |
min | -3.40 | 92.20 | -50.80 | 0.63 | 4963.60 |
max | 1.40 | 94.77 | -26.90 | 5.05 | 5228.10 |
q1 | -1.80 | 93.08 | -42.70 | 1.34 | 5099.10 |
median | 1.10 | 93.75 | -41.80 | 4.86 | 5191.00 |
q3 | 1.40 | 93.99 | -36.40 | 4.96 | 5228.10 |