From 45b71a9a2db4de8fb1f31b8552ab07ad92ee8737 Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Wed, 15 Jul 2020 15:21:02 +0800 Subject: [PATCH 01/11] Add XGBoost --- bin/functions/hibench_prop_env_mapping.py | 8 ++ bin/workloads/ml/xgboost/prepare/prepare.sh | 35 +++++ bin/workloads/ml/xgboost/spark/run.sh | 34 +++++ conf/workloads/ml/xgboost.conf | 26 ++++ sparkbench/ml/pom.xml | 10 ++ .../com/intel/sparkbench/ml/XGBoost.scala | 132 ++++++++++++++++++ 6 files changed, 245 insertions(+) create mode 100755 bin/workloads/ml/xgboost/prepare/prepare.sh create mode 100755 bin/workloads/ml/xgboost/spark/run.sh create mode 100644 conf/workloads/ml/xgboost.conf create mode 100644 sparkbench/ml/src/main/scala/com/intel/sparkbench/ml/XGBoost.scala diff --git a/bin/functions/hibench_prop_env_mapping.py b/bin/functions/hibench_prop_env_mapping.py index 272faac2e..5e7f04f44 100644 --- a/bin/functions/hibench_prop_env_mapping.py +++ b/bin/functions/hibench_prop_env_mapping.py @@ -127,6 +127,14 @@ MAX_BINS_GBT="hibench.gbt.maxBins", NUM_ITERATIONS_GBT="hibench.gbt.numIterations", LEARNING_RATE_GBT="hibench.gbt.learningRate", + # For XGBoost + NUM_EXAMPLES_XGBOOST="hibench.xgboost.examples", + NUM_FEATURES_XGBOOST="hibench.xgboost.features", + NUM_CLASSES_XGBOOST="hibench.xgboost.numClasses", + MAX_DEPTH_XGBOOST="hibench.xgboost.maxDepth", + MAX_BINS_XGBOOST="hibench.xgboost.maxBins", + NUM_ITERATIONS_XGBOOST="hibench.xgboost.numIterations", + LEARNING_RATE_XGBOOST="hibench.xgboost.learningRate", # For Random Forest NUM_EXAMPLES_RF="hibench.rf.examples", NUM_FEATURES_RF="hibench.rf.features", diff --git a/bin/workloads/ml/xgboost/prepare/prepare.sh b/bin/workloads/ml/xgboost/prepare/prepare.sh new file mode 100755 index 000000000..9506ef163 --- /dev/null +++ b/bin/workloads/ml/xgboost/prepare/prepare.sh @@ -0,0 +1,35 @@ +#!/bin/bash +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +current_dir=`dirname "$0"` +current_dir=`cd "$current_dir"; pwd` +root_dir=${current_dir}/../../../../../ +workload_config=${root_dir}/conf/workloads/ml/xgboost.conf +. "${root_dir}/bin/functions/load_bench_config.sh" + +enter_bench XGBoostDataPrepare ${workload_config} ${current_dir} +show_bannar start + +rmr_hdfs $INPUT_HDFS || true +START_TIME=`timestamp` + +run_spark_job com.intel.hibench.sparkbench.ml.GradientBoostedTreeDataGenerator $INPUT_HDFS $NUM_EXAMPLES_XGBOOST $NUM_FEATURES_XGBOOST + +END_TIME=`timestamp` + +show_bannar finish +leave_bench + diff --git a/bin/workloads/ml/xgboost/spark/run.sh b/bin/workloads/ml/xgboost/spark/run.sh new file mode 100755 index 000000000..76d9c2caf --- /dev/null +++ b/bin/workloads/ml/xgboost/spark/run.sh @@ -0,0 +1,34 @@ +#!/bin/bash +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. 
See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +current_dir=`dirname "$0"` +current_dir=`cd "$current_dir"; pwd` +root_dir=${current_dir}/../../../../../ +workload_config=${root_dir}/conf/workloads/ml/xgboost.conf +. "${root_dir}/bin/functions/load_bench_config.sh" + +enter_bench XGBoost ${workload_config} ${current_dir} +show_bannar start + +rmr_hdfs $OUTPUT_HDFS || true + +SIZE=`dir_size $INPUT_HDFS` +START_TIME=`timestamp` +run_spark_job com.intel.hibench.sparkbench.ml.XGBoost --numClasses $NUM_CLASSES_XGBOOST --maxDepth $MAX_DEPTH_XGBOOST --maxBins $MAX_BINS_XGBOOST --numIterations $NUM_ITERATIONS_XGBOOST --learningRate $LEARNING_RATE_XGBOOST $INPUT_HDFS +END_TIME=`timestamp` + +gen_report ${START_TIME} ${END_TIME} ${SIZE} +show_bannar finish +leave_bench diff --git a/conf/workloads/ml/xgboost.conf b/conf/workloads/ml/xgboost.conf new file mode 100644 index 000000000..7788f6242 --- /dev/null +++ b/conf/workloads/ml/xgboost.conf @@ -0,0 +1,26 @@ +hibench.xgboost.tiny.examples 10 +hibench.xgboost.tiny.features 100 +hibench.xgboost.small.examples 100 +hibench.xgboost.small.features 500 +hibench.xgboost.large.examples 1000 +hibench.xgboost.large.features 2000 +hibench.xgboost.huge.examples 1000 +hibench.xgboost.huge.features 4000 +hibench.xgboost.gigantic.examples 1000 +hibench.xgboost.gigantic.features 8000 +hibench.xgboost.bigdata.examples 1000 +hibench.xgboost.bigdata.features 12000 + + +hibench.xgboost.examples ${hibench.xgboost.${hibench.scale.profile}.examples} +hibench.xgboost.features ${hibench.xgboost.${hibench.scale.profile}.features} +hibench.xgboost.partitions ${hibench.default.map.parallelism} + +hibench.xgboost.numClasses 2 +hibench.xgboost.maxDepth 30 +hibench.xgboost.maxBins 32 +hibench.xgboost.numIterations 20 +hibench.xgboost.learningRate 0.1 + +hibench.workload.input ${hibench.hdfs.data.dir}/XGBoost/Input +hibench.workload.output ${hibench.hdfs.data.dir}/XGBoost/Output diff --git a/sparkbench/ml/pom.xml b/sparkbench/ml/pom.xml index d57898a56..44ae66cc9 100644 --- a/sparkbench/ml/pom.xml +++ b/sparkbench/ml/pom.xml @@ -53,5 +53,15 @@ mahout-math ${mahout.version} + + ml.dmlc + xgboost4j_${scala.binary.version} + 1.0.0 + + + ml.dmlc + xgboost4j-spark_${scala.binary.version} + 1.0.0 + diff --git a/sparkbench/ml/src/main/scala/com/intel/sparkbench/ml/XGBoost.scala b/sparkbench/ml/src/main/scala/com/intel/sparkbench/ml/XGBoost.scala new file mode 100644 index 000000000..349b34b9a --- /dev/null +++ b/sparkbench/ml/src/main/scala/com/intel/sparkbench/ml/XGBoost.scala @@ -0,0 +1,132 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.intel.hibench.sparkbench.ml + +import org.apache.spark.{SparkConf, SparkContext} +import org.apache.spark.mllib.tree.GradientBoostedTrees +import org.apache.spark.mllib.tree.configuration.BoostingStrategy +import org.apache.spark.mllib.tree.model.GradientBoostedTreesModel +import org.apache.spark.rdd.RDD +import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.ml.feature.{LabeledPoint => NewLabeledPoint} +import ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier +import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator +import org.apache.spark.sql.SparkSession +import scopt.OptionParser + +object XGBoost { + + case class Params( + numClasses: Int = 2, + maxDepth: Int = 30, + maxBins: Int = 32, + numIterations: Int = 20, + learningRate: Double = 0.1, + dataPath: String = null + ) + + def main(args: Array[String]): Unit = { + val defaultParams = Params() + + val parser = new OptionParser[Params]("XGBoost"){ + head("XGBoost: use XGBoost for classification") + opt[Int]("numClasses") + .text(s"numClasses, default: ${defaultParams.numClasses}") + .action((x,c) => c.copy(numClasses = x)) + opt[Int]("maxDepth") + .text(s"maxDepth, default: ${defaultParams.maxDepth}") + .action((x,c) => c.copy(maxDepth = x)) + opt[Int]("maxBins") + .text(s"maxBins, default: ${defaultParams.maxBins}") + .action((x,c) => c.copy(maxBins = x)) + opt[Int]("numIterations") + .text(s"numIterations, default: ${defaultParams.numIterations}") + .action((x,c) => c.copy(numIterations = x)) + opt[Double]("learningRate") + .text(s"learningRate, default: ${defaultParams.learningRate}") + .action((x,c) => c.copy(learningRate = x)) + arg[String]("") + .required() + .text("data path for XGBoost") + .action((x,c) => c.copy(dataPath = x)) + } + parser.parse(args, defaultParams) match { + case Some(params) => run(params) + case _ => sys.exit(1) + } + } + + def run(params: Params): Unit = { + + val spark = SparkSession + .builder + .appName(s"XGBoost with $params") + .getOrCreate() + + val sc = spark.sparkContext + + import spark.implicits._ + + val dataPath = params.dataPath + val numClasses = params.numClasses + val maxDepth = params.maxDepth + val maxBins = params.maxBins + val numIterations = params.numIterations + val learningRate = params.learningRate + + // Load data file. 
+ val mllibRDD: RDD[LabeledPoint] = sc.objectFile(dataPath) + // Convert to ML LabeledPoint and to DataFrame + val mlRDD: RDD[NewLabeledPoint] = mllibRDD.map { p => NewLabeledPoint(p.label, p.features.asML) } + val data = mlRDD.toDF + + // Split the data into training and test sets (30% held out for testing) + val splits = data.randomSplit(Array(0.7, 0.3)) + val (trainingData, testData) = (splits(0), splits(1)) + + val numWorkers = sc.getConf.getInt("spark.executor.instances", 2) + val numThreads = sc.getConf.getInt("spark.executor.cores", 1) + + val xgbParam = Map("eta" -> learningRate, + "num_round" -> numIterations, + "eta" -> learningRate, + "num_class" -> numClasses, + "max_depth" -> maxDepth, + "max_bin" -> maxBins, + "objective" -> "multi:softprob" + ) + val xgbClassifier = new XGBoostClassifier(xgbParam). + setFeaturesCol("features"). + setLabelCol("label") + + val model = xgbClassifier.fit(trainingData) + + // Make predictions. + val predictions = model.transform(testData) + + // Select (prediction, true label) and compute test error. + val evaluator = new MulticlassClassificationEvaluator() + .setLabelCol("label") + .setPredictionCol("prediction") + .setMetricName("accuracy") + val accuracy = evaluator.evaluate(predictions) + println(s"Test Error = ${1.0 - accuracy}") + + sc.stop() + } +} From b81f847435778902fa71d45d3ee1a3b74546d844 Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Wed, 15 Jul 2020 15:23:54 +0800 Subject: [PATCH 02/11] Clean up code --- .../ml/src/main/scala/com/intel/sparkbench/ml/XGBoost.scala | 3 --- 1 file changed, 3 deletions(-) diff --git a/sparkbench/ml/src/main/scala/com/intel/sparkbench/ml/XGBoost.scala b/sparkbench/ml/src/main/scala/com/intel/sparkbench/ml/XGBoost.scala index 349b34b9a..99e7e9115 100644 --- a/sparkbench/ml/src/main/scala/com/intel/sparkbench/ml/XGBoost.scala +++ b/sparkbench/ml/src/main/scala/com/intel/sparkbench/ml/XGBoost.scala @@ -99,9 +99,6 @@ object XGBoost { val splits = data.randomSplit(Array(0.7, 0.3)) val (trainingData, testData) = (splits(0), splits(1)) - val numWorkers = sc.getConf.getInt("spark.executor.instances", 2) - val numThreads = sc.getConf.getInt("spark.executor.cores", 1) - val xgbParam = Map("eta" -> learningRate, "num_round" -> numIterations, "eta" -> learningRate, From 6210f925593718800133914e6c5e217c0123307e Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Wed, 15 Jul 2020 16:32:10 +0800 Subject: [PATCH 03/11] Add num_workers and nthread params and check spark.task.cpus --- .../scala/com/intel/sparkbench/ml/XGBoost.scala | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/sparkbench/ml/src/main/scala/com/intel/sparkbench/ml/XGBoost.scala b/sparkbench/ml/src/main/scala/com/intel/sparkbench/ml/XGBoost.scala index 99e7e9115..9f68e5abd 100644 --- a/sparkbench/ml/src/main/scala/com/intel/sparkbench/ml/XGBoost.scala +++ b/sparkbench/ml/src/main/scala/com/intel/sparkbench/ml/XGBoost.scala @@ -99,13 +99,25 @@ object XGBoost { val splits = data.randomSplit(Array(0.7, 0.3)) val (trainingData, testData) = (splits(0), splits(1)) + val numWorkers = sc.getConf.getInt("spark.executor.instances", -1) + val numThreads = sc.getConf.getInt("spark.executor.cores", -1) + val taskCPUs = sc.getConf.getInt("spark.task.cpus", -1) + + if (numWorkers == -1 || numThreads == -1 || taskCPUs == -1) { + println("XGBoost error: should set spark.executor.instances, " + + "spark.executor.cores and spark.task.cpus in Spark Config") + sys.exit(1) + } + val xgbParam = Map("eta" -> learningRate, "num_round" -> 
numIterations, "eta" -> learningRate, "num_class" -> numClasses, "max_depth" -> maxDepth, "max_bin" -> maxBins, - "objective" -> "multi:softprob" + "objective" -> "multi:softprob", + "num_workers" -> numWorkers, + "nthread" -> numThreads ) val xgbClassifier = new XGBoostClassifier(xgbParam). setFeaturesCol("features"). From 85851a6433ad3ada273d81d9c582d0b362626c02 Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Fri, 17 Jul 2020 12:05:43 +0800 Subject: [PATCH 04/11] modify README --- README.md | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 8217280bf..222870e88 100644 --- a/README.md +++ b/README.md @@ -74,27 +74,31 @@ There are totally 27 workloads in HiBench. The workloads are divided into 6 cate Gradient-boosted trees (GBT) is a popular regression method using ensembles of decision trees. This workload is implemented in spark.mllib and the input data set is generated by GradientBoostedTreeDataGenerator. -6. Linear Regression (Linear) +6. XGBoost (XGB) + + XGBoost is an optimized distributed gradient boosting library designed to be highly efficient, flexible and portable. This workload is implemented with XGBoost4J-Spark API in spark.mllib and the input data set is generated by GradientBoostedTreeDataGenerator. + +7. Linear Regression (Linear) Linear Regression (Linear) is a workload that implemented in spark.ml with ElasticNet. The input data set is generated by LinearRegressionDataGenerator. -7. Latent Dirichlet Allocation (LDA) +8. Latent Dirichlet Allocation (LDA) Latent Dirichlet allocation (LDA) is a topic model which infers topics from a collection of text documents. This workload is implemented in spark.mllib and the input data set is generated by LDADataGenerator. -8. Principal Components Analysis (PCA) +9. Principal Components Analysis (PCA) Principal component analysis (PCA) is a statistical method to find a rotation such that the first coordinate has the largest variance possible, and each succeeding coordinate in turn has the largest variance possible. PCA is used widely in dimensionality reduction. This workload is implemented in spark.ml. The input data set is generated by PCADataGenerator. -9. Random Forest (RF) +10. Random Forest (RF) Random forests (RF) are ensembles of decision trees. Random forests are one of the most successful machine learning models for classification and regression. They combine many decision trees in order to reduce the risk of overfitting. This workload is implemented in spark.mllib and the input data set is generated by RandomForestDataGenerator. -10. Support Vector Machine (SVM) +11. Support Vector Machine (SVM) Support Vector Machine (SVM) is a standard method for large-scale classification tasks. This workload is implemented in spark.mllib and the input data set is generated by SVMDataGenerator. -11. Singular Value Decomposition (SVD) +12. Singular Value Decomposition (SVD) Singular value decomposition (SVD) factorizes a matrix into three matrices. This workload is implemented in spark.mllib and its input data set is generated by SVDDataGenerator. From 93b663bd6e55479eb403f3e908d854c815c29b0b Mon Sep 17 00:00:00 2001 From: Xiaochang Wu Date: Fri, 17 Jul 2020 21:59:24 +0800 Subject: [PATCH 05/11] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 222870e88..ec4d114d3 100644 --- a/README.md +++ b/README.md @@ -74,7 +74,7 @@ There are totally 27 workloads in HiBench. 
The workloads are divided into 6 cate Gradient-boosted trees (GBT) is a popular regression method using ensembles of decision trees. This workload is implemented in spark.mllib and the input data set is generated by GradientBoostedTreeDataGenerator. -6. XGBoost (XGB) +6. XGBoost (XGBoost) XGBoost is an optimized distributed gradient boosting library designed to be highly efficient, flexible and portable. This workload is implemented with XGBoost4J-Spark API in spark.mllib and the input data set is generated by GradientBoostedTreeDataGenerator. From 31c3b5993e78c5bc5972ccedd7daaf6f7caa98c3 Mon Sep 17 00:00:00 2001 From: "Jiang, Bo" Date: Thu, 30 Jul 2020 22:32:53 +0800 Subject: [PATCH 06/11] update xgboost configuration to use v1.1.0-scala2.12 by default --- conf/hibench.conf | 2 +- conf/workloads/ml/xgboost.conf | 2 +- docs/run-sparkbench.md | 56 ++++++++++++++++++++++++++++++++++ pom.xml | 11 +++++++ sparkbench/ml/pom.xml | 4 +-- 5 files changed, 71 insertions(+), 4 deletions(-) diff --git a/conf/hibench.conf b/conf/hibench.conf index b825997c0..867ec9eb5 100644 --- a/conf/hibench.conf +++ b/conf/hibench.conf @@ -1,6 +1,6 @@ # Data scale profile. Available value is tiny, small, large, huge, gigantic and bigdata. # The definition of these profiles can be found in the workload's conf file i.e. conf/workloads/micro/wordcount.conf -hibench.scale.profile tiny +hibench.scale.profile large # Mapper number in hadoop, partition number in Spark hibench.default.map.parallelism 8 diff --git a/conf/workloads/ml/xgboost.conf b/conf/workloads/ml/xgboost.conf index 7788f6242..9c04e6da6 100644 --- a/conf/workloads/ml/xgboost.conf +++ b/conf/workloads/ml/xgboost.conf @@ -17,7 +17,7 @@ hibench.xgboost.features ${hibench.xgboost.${hibench.scale.pr hibench.xgboost.partitions ${hibench.default.map.parallelism} hibench.xgboost.numClasses 2 -hibench.xgboost.maxDepth 30 +hibench.xgboost.maxDepth 8 hibench.xgboost.maxBins 32 hibench.xgboost.numIterations 20 hibench.xgboost.learningRate 0.1 diff --git a/docs/run-sparkbench.md b/docs/run-sparkbench.md index 459bb3e94..3cf9cf676 100644 --- a/docs/run-sparkbench.md +++ b/docs/run-sparkbench.md @@ -88,3 +88,59 @@ hibench.yarn.executor.num | Spark executor number in Yarn mode hibench.yarn.executor.cores | Spark executor cores in Yarn mode spark.executor.memory | Spark executor memory spark.driver.memory | Spark driver memory + + +### 8. Run xgboost workload ### + +Hibench xgboost benchmark depends on the xgboost libraries to build and run. The libs are ```xgboost4j_-.jar``` and ```xgboost4j-spark_-.jar```.
+The relevant configurations are in ```./sparkbench/ml/pom.xml```
+```
+    <dependency>
+      <groupId>ml.dmlc</groupId>
+      <artifactId>xgboost4j_${scala.binary.version}</artifactId>
+      <version>1.1.0</version>
+    </dependency>
+    <dependency>
+      <groupId>ml.dmlc</groupId>
+      <artifactId>xgboost4j-spark_${scala.binary.version}</artifactId>
+      <version>1.1.0</version>
+    </dependency>
+```
+and ```./pom.xml```
+```
+    <repository>
+      <id>xgboostrepo</id>
+      <name>XGBoost Maven Repo</name>
+      <url>https://s3-us-west-2.amazonaws.com/xgboost-maven-repo/release</url>
+      <releases>
+        <enabled>true</enabled>
+      </releases>
+      <snapshots>
+        <enabled>false</enabled>
+      </snapshots>
+    </repository>
+```
+
+#### 8.a latest xgboost release (default) ####
+
+By default, the hibench xgboost benchmark is configured to use the latest xgboost release from https://s3-us-west-2.amazonaws.com/xgboost-maven-repo/release.
+To use it, simply build hibench, prepare data and run xgboost benchmark. For example, +``` +$ mvn -Psparkbench -Dmodules -Pml -Dspark=2.4 -Dscala=2.12 clean package +$ bin/workloads/ml/xgboost/prepare/prepare.sh && hdfs dfs -du -s -h /HiBench/XGBoost/Input +$ bin/workloads/ml/xgboost/spark/run.sh +``` + +#### 8.b other xgboost releases #### + +To use other xgboost releases, change the xgboost versions for xgboost4j and xgboost4j-spark to the target versions in ```./sparkbench/ml/pom.xml```. The ```scala.binary.version``` can be specified by command line parameter ```-Dscala```.
+e.g. to use xgboost v1.0.0, change ```1.1.0``` to ```1.0.0``` for both xgboost4j and xgboost4j-spark.
+If the xgboost release is from other maven repo, update the xgboostrepo url in ```./pom.xml``` as well.
+After that, build hibench, prepare data and run xgboost benchmark. + +#### 8.c xgboost jar files #### + +If you only have the xgboost jar files, just copy them to $SPARK_HOME/jars/ and update the relevant versions for xgboost4j and xgboost4j-spark in sparkbench/ml/pom.xml to get aligned.
+For example, if xgboost is built from source on a Linux platform, the jars will be generated and installed to ```~/.m2/repository/ml/dmlc/xgboost4j_<scala.binary.version>/<xgboost.version>-SNAPSHOT/``` and ```~/.m2/repository/ml/dmlc/xgboost4j-spark_<scala.binary.version>/<xgboost.version>-SNAPSHOT/``` respectively. To use them, copy the 2 jars to $SPARK_HOME/jars/ and update the relevant versions for xgboost4j and xgboost4j-spark in the pom.xml files.
+After that, build hibench, prepare data and run xgboost benchmark. + diff --git a/pom.xml b/pom.xml index f93d602c9..a80101570 100644 --- a/pom.xml +++ b/pom.xml @@ -80,6 +80,17 @@ Scala-tools Maven 2 Repository https://oss.sonatype.org/content/groups/scala-tools/ + + xgboostrepo + XGBoost Maven Repo + https://s3-us-west-2.amazonaws.com/xgboost-maven-repo/release + + true + + + false + + diff --git a/sparkbench/ml/pom.xml b/sparkbench/ml/pom.xml index 44ae66cc9..32075c625 100644 --- a/sparkbench/ml/pom.xml +++ b/sparkbench/ml/pom.xml @@ -56,12 +56,12 @@ ml.dmlc xgboost4j_${scala.binary.version} - 1.0.0 + 1.1.0 ml.dmlc xgboost4j-spark_${scala.binary.version} - 1.0.0 + 1.1.0 From 5791a0fec320be597b89ffa4a2e495e399b524f0 Mon Sep 17 00:00:00 2001 From: "Jiang, Bo" Date: Thu, 30 Jul 2020 23:37:09 +0800 Subject: [PATCH 07/11] refine XGBoost.scala, use pipeline --- .../scala/com/intel/sparkbench/ml/XGBoost.scala | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/sparkbench/ml/src/main/scala/com/intel/sparkbench/ml/XGBoost.scala b/sparkbench/ml/src/main/scala/com/intel/sparkbench/ml/XGBoost.scala index 9f68e5abd..f497a1613 100644 --- a/sparkbench/ml/src/main/scala/com/intel/sparkbench/ml/XGBoost.scala +++ b/sparkbench/ml/src/main/scala/com/intel/sparkbench/ml/XGBoost.scala @@ -18,11 +18,12 @@ package com.intel.hibench.sparkbench.ml import org.apache.spark.{SparkConf, SparkContext} -import org.apache.spark.mllib.tree.GradientBoostedTrees -import org.apache.spark.mllib.tree.configuration.BoostingStrategy -import org.apache.spark.mllib.tree.model.GradientBoostedTreesModel +// import org.apache.spark.mllib.tree.GradientBoostedTrees +// import org.apache.spark.mllib.tree.configuration.BoostingStrategy +// import org.apache.spark.mllib.tree.model.GradientBoostedTreesModel import org.apache.spark.rdd.RDD import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.ml.Pipeline import org.apache.spark.ml.feature.{LabeledPoint => NewLabeledPoint} import ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator @@ -33,7 +34,7 @@ object XGBoost { case class Params( numClasses: Int = 2, - maxDepth: Int = 30, + maxDepth: Int = 8, maxBins: Int = 32, numIterations: Int = 20, learningRate: Double = 0.1, @@ -93,7 +94,7 @@ object XGBoost { val mllibRDD: RDD[LabeledPoint] = sc.objectFile(dataPath) // Convert to ML LabeledPoint and to DataFrame val mlRDD: RDD[NewLabeledPoint] = mllibRDD.map { p => NewLabeledPoint(p.label, p.features.asML) } - val data = mlRDD.toDF + val data = mlRDD.toDF("label", "features") // Split the data into training and test sets (30% held out for testing) val splits = data.randomSplit(Array(0.7, 0.3)) @@ -123,7 +124,9 @@ object XGBoost { setFeaturesCol("features"). setLabelCol("label") - val model = xgbClassifier.fit(trainingData) + val pipeline = new Pipeline().setStages(Array(xgbClassifier)) + + val model = pipeline.fit(trainingData) // Make predictions. val predictions = model.transform(testData) From 38fc921b90a89296cfbed56da59af90e0b0103a2 Mon Sep 17 00:00:00 2001 From: bobjiang82 Date: Fri, 7 Aug 2020 09:28:14 +0800 Subject: [PATCH 08/11] Update hibench.conf --- conf/hibench.conf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/hibench.conf b/conf/hibench.conf index 867ec9eb5..b825997c0 100644 --- a/conf/hibench.conf +++ b/conf/hibench.conf @@ -1,6 +1,6 @@ # Data scale profile. Available value is tiny, small, large, huge, gigantic and bigdata. 
# The definition of these profiles can be found in the workload's conf file i.e. conf/workloads/micro/wordcount.conf -hibench.scale.profile large +hibench.scale.profile tiny # Mapper number in hadoop, partition number in Spark hibench.default.map.parallelism 8 From a213d7536f4ce7323959ac97df13fee0d094ec50 Mon Sep 17 00:00:00 2001 From: bobjiang82 Date: Fri, 7 Aug 2020 10:46:00 +0800 Subject: [PATCH 09/11] Update run-sparkbench.md commit code first and continue to refine doc. --- docs/run-sparkbench.md | 56 ------------------------------------------ 1 file changed, 56 deletions(-) diff --git a/docs/run-sparkbench.md b/docs/run-sparkbench.md index 3cf9cf676..459bb3e94 100644 --- a/docs/run-sparkbench.md +++ b/docs/run-sparkbench.md @@ -88,59 +88,3 @@ hibench.yarn.executor.num | Spark executor number in Yarn mode hibench.yarn.executor.cores | Spark executor cores in Yarn mode spark.executor.memory | Spark executor memory spark.driver.memory | Spark driver memory - - -### 8. Run xgboost workload ### - -Hibench xgboost benchmark depends on the xgboost libraries to build and run. The libs are ```xgboost4j_-.jar``` and ```xgboost4j-spark_-.jar```.
-The relevant configurations are in ```./sparkbench/ml/pom.xml```
-```
-    <dependency>
-      <groupId>ml.dmlc</groupId>
-      <artifactId>xgboost4j_${scala.binary.version}</artifactId>
-      <version>1.1.0</version>
-    </dependency>
-    <dependency>
-      <groupId>ml.dmlc</groupId>
-      <artifactId>xgboost4j-spark_${scala.binary.version}</artifactId>
-      <version>1.1.0</version>
-    </dependency>
-```
-and ```./pom.xml```
-```
-    <repository>
-      <id>xgboostrepo</id>
-      <name>XGBoost Maven Repo</name>
-      <url>https://s3-us-west-2.amazonaws.com/xgboost-maven-repo/release</url>
-      <releases>
-        <enabled>true</enabled>
-      </releases>
-      <snapshots>
-        <enabled>false</enabled>
-      </snapshots>
-    </repository>
-```
-
-#### 8.a latest xgboost release (default) ####
-
-By default, the hibench xgboost benchmark is configured to use the latest xgboost release from https://s3-us-west-2.amazonaws.com/xgboost-maven-repo/release.
-To use it, simply build hibench, prepare data and run xgboost benchmark. For example, -``` -$ mvn -Psparkbench -Dmodules -Pml -Dspark=2.4 -Dscala=2.12 clean package -$ bin/workloads/ml/xgboost/prepare/prepare.sh && hdfs dfs -du -s -h /HiBench/XGBoost/Input -$ bin/workloads/ml/xgboost/spark/run.sh -``` - -#### 8.b other xgboost releases #### - -To use other xgboost releases, change the xgboost versions for xgboost4j and xgboost4j-spark to the target versions in ```./sparkbench/ml/pom.xml```. The ```scala.binary.version``` can be specified by command line parameter ```-Dscala```.
-e.g. to use xgboost v1.0.0, change ```1.1.0``` to ```1.0.0``` for both xgboost4j and xgboost4j-spark.
-If the xgboost release is from other maven repo, update the xgboostrepo url in ```./pom.xml``` as well.
-After that, build hibench, prepare data and run xgboost benchmark. - -#### 8.c xgboost jar files #### - -If you only have the xgboost jar files, just copy them to $SPARK_HOME/jars/ and update the relevant versions for xgboost4j and xgboost4j-spark in sparkbench/ml/pom.xml to get aligned.
-For example, if xgboost is built from source on a Linux platform, the jars will be generated and installed to ```~/.m2/repository/ml/dmlc/xgboost4j_<scala.binary.version>/<xgboost.version>-SNAPSHOT/``` and ```~/.m2/repository/ml/dmlc/xgboost4j-spark_<scala.binary.version>/<xgboost.version>-SNAPSHOT/``` respectively. To use them, copy the 2 jars to $SPARK_HOME/jars/ and update the relevant versions for xgboost4j and xgboost4j-spark in the pom.xml files.
-After that, build hibench, prepare data and run xgboost benchmark. - From b92c8ccfa6ec602520179f7bf51b89db970d67a5 Mon Sep 17 00:00:00 2001 From: bobjiang82 Date: Fri, 14 Aug 2020 18:06:06 +0800 Subject: [PATCH 10/11] Update benchmarks.lst --- conf/benchmarks.lst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/conf/benchmarks.lst b/conf/benchmarks.lst index 3ef7717b5..4d62ba3a4 100644 --- a/conf/benchmarks.lst +++ b/conf/benchmarks.lst @@ -24,5 +24,6 @@ ml.linear ml.lda ml.svm ml.gmm +ml.xgboost -graph.nweight \ No newline at end of file +graph.nweight From 7902f59f5e1479ded362981a4e6897ce1a2bd5ea Mon Sep 17 00:00:00 2001 From: bobjiang82 Date: Fri, 14 Aug 2020 18:06:35 +0800 Subject: [PATCH 11/11] Update benchmarks_ml.lst --- travis/benchmarks_ml.lst | 1 + 1 file changed, 1 insertion(+) diff --git a/travis/benchmarks_ml.lst b/travis/benchmarks_ml.lst index 6e4894a5a..362df64d4 100644 --- a/travis/benchmarks_ml.lst +++ b/travis/benchmarks_ml.lst @@ -9,3 +9,4 @@ ml.linear ml.lda ml.svm ml.gmm +ml.xgboost
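
The patch series above wires XGBoost4J-Spark into HiBench through the prepare/run scripts, the workload config, and XGBoost.scala. For readers who want to try the same training path outside the HiBench harness, below is a minimal, self-contained sketch that mirrors the calls used in XGBoost.scala (Map-based XGBoostClassifier construction, fit, transform, accuracy evaluation). The object name, the synthetic dataset, and the scaled-down parameter values are illustrative assumptions and are not part of the patches; it assumes xgboost4j and xgboost4j-spark jars matching your Scala version are on the classpath.

```
// Minimal smoke test for the XGBoost4J-Spark path used by the new workload.
// Dataset, object name and parameter values are illustrative assumptions.
import ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.SparkSession

object XGBoostSmokeTest {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.appName("XGBoostSmokeTest").getOrCreate()
    import spark.implicits._

    // Tiny synthetic binary-classification set: label is 1.0 when the first feature is positive.
    val data = Seq.tabulate(200) { i =>
      val x = if (i % 2 == 0) 1.0 else -1.0
      (if (x > 0) 1.0 else 0.0, Vectors.dense(x, (i % 7).toDouble))
    }.toDF("label", "features")

    val Array(train, test) = data.randomSplit(Array(0.7, 0.3), seed = 42L)

    // Same Map-based construction as XGBoost.scala in the patch, scaled down for a quick local run.
    val xgbParam = Map(
      "eta" -> 0.1,
      "num_round" -> 5,
      "num_class" -> 2,
      "max_depth" -> 3,
      "objective" -> "multi:softprob",
      "num_workers" -> 1,
      "nthread" -> 1)

    val model = new XGBoostClassifier(xgbParam)
      .setFeaturesCol("features")
      .setLabelCol("label")
      .fit(train)

    // Accuracy on the held-out split, as in the benchmark's evaluator.
    val accuracy = new MulticlassClassificationEvaluator()
      .setLabelCol("label")
      .setPredictionCol("prediction")
      .setMetricName("accuracy")
      .evaluate(model.transform(test))
    println(s"Smoke-test accuracy = $accuracy")

    spark.stop()
  }
}
```

With the xgboost4j jars available (see section 8 of docs/run-sparkbench.md above), this can be submitted with spark-submit like any other Spark application; the benchmark itself feeds the equivalent parameters from conf/workloads/ml/xgboost.conf through bin/workloads/ml/xgboost/spark/run.sh.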