- Start Spark shell:
$ spark-shell
- Do the necessary imports:
scala> import org.apache.spark.ml.regression.LinearRegression
scala> import org.apache.spark.ml.evaluation.RegressionEvaluator
scala> import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
- Load data as DataFrame:
scala> val data = spark.read.format("libsvm").load("s3a://sparkcookbook/housingdata/realestate.libsvm")
- Split data into training and test sets:
scala> val Array(training, test) = data.randomSplit(Array(0.7, 0.3))
- Instantiate linear regression:
scala> val lr = new LinearRegression().setMaxIter(10)
- Create a parameter grid:
scala> val paramGrid = new ParamGridBuilder()
.addGrid(lr.regParam, Array(0.1,0.01))
.addGrid(lr.fitIntercept)
.addGrid(lr.elasticNetParam, Array(0.0, 0.5, 1.0))
.build()
- Create a training validation split:
scala> val trainValidationSplit = new TrainValidationSplit()
.setEstimator(lr)
.setEvaluator(new RegressionEvaluator)
.setEstimatorParamMaps(paramGrid)
.setTrainRatio(0.8)
- Train the model:
scala> val model = trainValidationSplit.fit(training)
- Do the predictions on the test dataset:
scala> val predictions = model.transform(test)
- Evaluate the predictions:
scala> val evaluator = new RegressionEvaluator()
scala> evaluator.evaluate(predictions)