From 45b71a9a2db4de8fb1f31b8552ab07ad92ee8737 Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Wed, 15 Jul 2020 15:21:02 +0800 Subject: [PATCH 01/11] Add XGBoost --- bin/functions/hibench_prop_env_mapping.py | 8 ++ bin/workloads/ml/xgboost/prepare/prepare.sh | 35 +++++ bin/workloads/ml/xgboost/spark/run.sh | 34 +++++ conf/workloads/ml/xgboost.conf | 26 ++++ sparkbench/ml/pom.xml | 10 ++ .../com/intel/sparkbench/ml/XGBoost.scala | 132 ++++++++++++++++++ 6 files changed, 245 insertions(+) create mode 100755 bin/workloads/ml/xgboost/prepare/prepare.sh create mode 100755 bin/workloads/ml/xgboost/spark/run.sh create mode 100644 conf/workloads/ml/xgboost.conf create mode 100644 sparkbench/ml/src/main/scala/com/intel/sparkbench/ml/XGBoost.scala diff --git a/bin/functions/hibench_prop_env_mapping.py b/bin/functions/hibench_prop_env_mapping.py index 272faac2e..5e7f04f44 100644 --- a/bin/functions/hibench_prop_env_mapping.py +++ b/bin/functions/hibench_prop_env_mapping.py @@ -127,6 +127,14 @@ MAX_BINS_GBT="hibench.gbt.maxBins", NUM_ITERATIONS_GBT="hibench.gbt.numIterations", LEARNING_RATE_GBT="hibench.gbt.learningRate", + # For XGBoost + NUM_EXAMPLES_XGBOOST="hibench.xgboost.examples", + NUM_FEATURES_XGBOOST="hibench.xgboost.features", + NUM_CLASSES_XGBOOST="hibench.xgboost.numClasses", + MAX_DEPTH_XGBOOST="hibench.xgboost.maxDepth", + MAX_BINS_XGBOOST="hibench.xgboost.maxBins", + NUM_ITERATIONS_XGBOOST="hibench.xgboost.numIterations", + LEARNING_RATE_XGBOOST="hibench.xgboost.learningRate", # For Random Forest NUM_EXAMPLES_RF="hibench.rf.examples", NUM_FEATURES_RF="hibench.rf.features", diff --git a/bin/workloads/ml/xgboost/prepare/prepare.sh b/bin/workloads/ml/xgboost/prepare/prepare.sh new file mode 100755 index 000000000..9506ef163 --- /dev/null +++ b/bin/workloads/ml/xgboost/prepare/prepare.sh @@ -0,0 +1,35 @@ +#!/bin/bash +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +current_dir=`dirname "$0"` +current_dir=`cd "$current_dir"; pwd` +root_dir=${current_dir}/../../../../../ +workload_config=${root_dir}/conf/workloads/ml/xgboost.conf +. "${root_dir}/bin/functions/load_bench_config.sh" + +enter_bench XGBoostDataPrepare ${workload_config} ${current_dir} +show_bannar start + +rmr_hdfs $INPUT_HDFS || true +START_TIME=`timestamp` + +run_spark_job com.intel.hibench.sparkbench.ml.GradientBoostedTreeDataGenerator $INPUT_HDFS $NUM_EXAMPLES_XGBOOST $NUM_FEATURES_XGBOOST + +END_TIME=`timestamp` + +show_bannar finish +leave_bench + diff --git a/bin/workloads/ml/xgboost/spark/run.sh b/bin/workloads/ml/xgboost/spark/run.sh new file mode 100755 index 000000000..76d9c2caf --- /dev/null +++ b/bin/workloads/ml/xgboost/spark/run.sh @@ -0,0 +1,34 @@ +#!/bin/bash +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. 
See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +current_dir=`dirname "$0"` +current_dir=`cd "$current_dir"; pwd` +root_dir=${current_dir}/../../../../../ +workload_config=${root_dir}/conf/workloads/ml/xgboost.conf +. "${root_dir}/bin/functions/load_bench_config.sh" + +enter_bench XGBoost ${workload_config} ${current_dir} +show_bannar start + +rmr_hdfs $OUTPUT_HDFS || true + +SIZE=`dir_size $INPUT_HDFS` +START_TIME=`timestamp` +run_spark_job com.intel.hibench.sparkbench.ml.XGBoost --numClasses $NUM_CLASSES_XGBOOST --maxDepth $MAX_DEPTH_XGBOOST --maxBins $MAX_BINS_XGBOOST --numIterations $NUM_ITERATIONS_XGBOOST --learningRate $LEARNING_RATE_XGBOOST $INPUT_HDFS +END_TIME=`timestamp` + +gen_report ${START_TIME} ${END_TIME} ${SIZE} +show_bannar finish +leave_bench diff --git a/conf/workloads/ml/xgboost.conf b/conf/workloads/ml/xgboost.conf new file mode 100644 index 000000000..7788f6242 --- /dev/null +++ b/conf/workloads/ml/xgboost.conf @@ -0,0 +1,26 @@ +hibench.xgboost.tiny.examples 10 +hibench.xgboost.tiny.features 100 +hibench.xgboost.small.examples 100 +hibench.xgboost.small.features 500 +hibench.xgboost.large.examples 1000 +hibench.xgboost.large.features 2000 +hibench.xgboost.huge.examples 1000 +hibench.xgboost.huge.features 4000 +hibench.xgboost.gigantic.examples 1000 +hibench.xgboost.gigantic.features 8000 +hibench.xgboost.bigdata.examples 1000 +hibench.xgboost.bigdata.features 12000 + + +hibench.xgboost.examples ${hibench.xgboost.${hibench.scale.profile}.examples} +hibench.xgboost.features ${hibench.xgboost.${hibench.scale.profile}.features} +hibench.xgboost.partitions ${hibench.default.map.parallelism} + +hibench.xgboost.numClasses 2 +hibench.xgboost.maxDepth 30 +hibench.xgboost.maxBins 32 +hibench.xgboost.numIterations 20 +hibench.xgboost.learningRate 0.1 + +hibench.workload.input ${hibench.hdfs.data.dir}/XGBoost/Input +hibench.workload.output ${hibench.hdfs.data.dir}/XGBoost/Output diff --git a/sparkbench/ml/pom.xml b/sparkbench/ml/pom.xml index d57898a56..44ae66cc9 100644 --- a/sparkbench/ml/pom.xml +++ b/sparkbench/ml/pom.xml @@ -53,5 +53,15 @@ mahout-math ${mahout.version} + + ml.dmlc + xgboost4j_${scala.binary.version} + 1.0.0 + + + ml.dmlc + xgboost4j-spark_${scala.binary.version} + 1.0.0 + diff --git a/sparkbench/ml/src/main/scala/com/intel/sparkbench/ml/XGBoost.scala b/sparkbench/ml/src/main/scala/com/intel/sparkbench/ml/XGBoost.scala new file mode 100644 index 000000000..349b34b9a --- /dev/null +++ b/sparkbench/ml/src/main/scala/com/intel/sparkbench/ml/XGBoost.scala @@ -0,0 +1,132 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.intel.hibench.sparkbench.ml + +import org.apache.spark.{SparkConf, SparkContext} +import org.apache.spark.mllib.tree.GradientBoostedTrees +import org.apache.spark.mllib.tree.configuration.BoostingStrategy +import org.apache.spark.mllib.tree.model.GradientBoostedTreesModel +import org.apache.spark.rdd.RDD +import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.ml.feature.{LabeledPoint => NewLabeledPoint} +import ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier +import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator +import org.apache.spark.sql.SparkSession +import scopt.OptionParser + +object XGBoost { + + case class Params( + numClasses: Int = 2, + maxDepth: Int = 30, + maxBins: Int = 32, + numIterations: Int = 20, + learningRate: Double = 0.1, + dataPath: String = null + ) + + def main(args: Array[String]): Unit = { + val defaultParams = Params() + + val parser = new OptionParser[Params]("XGBoost"){ + head("XGBoost: use XGBoost for classification") + opt[Int]("numClasses") + .text(s"numClasses, default: ${defaultParams.numClasses}") + .action((x,c) => c.copy(numClasses = x)) + opt[Int]("maxDepth") + .text(s"maxDepth, default: ${defaultParams.maxDepth}") + .action((x,c) => c.copy(maxDepth = x)) + opt[Int]("maxBins") + .text(s"maxBins, default: ${defaultParams.maxBins}") + .action((x,c) => c.copy(maxBins = x)) + opt[Int]("numIterations") + .text(s"numIterations, default: ${defaultParams.numIterations}") + .action((x,c) => c.copy(numIterations = x)) + opt[Double]("learningRate") + .text(s"learningRate, default: ${defaultParams.learningRate}") + .action((x,c) => c.copy(learningRate = x)) + arg[String]("") + .required() + .text("data path for XGBoost") + .action((x,c) => c.copy(dataPath = x)) + } + parser.parse(args, defaultParams) match { + case Some(params) => run(params) + case _ => sys.exit(1) + } + } + + def run(params: Params): Unit = { + + val spark = SparkSession + .builder + .appName(s"XGBoost with $params") + .getOrCreate() + + val sc = spark.sparkContext + + import spark.implicits._ + + val dataPath = params.dataPath + val numClasses = params.numClasses + val maxDepth = params.maxDepth + val maxBins = params.maxBins + val numIterations = params.numIterations + val learningRate = params.learningRate + + // Load data file. 
+ val mllibRDD: RDD[LabeledPoint] = sc.objectFile(dataPath) + // Convert to ML LabeledPoint and to DataFrame + val mlRDD: RDD[NewLabeledPoint] = mllibRDD.map { p => NewLabeledPoint(p.label, p.features.asML) } + val data = mlRDD.toDF + + // Split the data into training and test sets (30% held out for testing) + val splits = data.randomSplit(Array(0.7, 0.3)) + val (trainingData, testData) = (splits(0), splits(1)) + + val numWorkers = sc.getConf.getInt("spark.executor.instances", 2) + val numThreads = sc.getConf.getInt("spark.executor.cores", 1) + + val xgbParam = Map("eta" -> learningRate, + "num_round" -> numIterations, + "eta" -> learningRate, + "num_class" -> numClasses, + "max_depth" -> maxDepth, + "max_bin" -> maxBins, + "objective" -> "multi:softprob" + ) + val xgbClassifier = new XGBoostClassifier(xgbParam). + setFeaturesCol("features"). + setLabelCol("label") + + val model = xgbClassifier.fit(trainingData) + + // Make predictions. + val predictions = model.transform(testData) + + // Select (prediction, true label) and compute test error. + val evaluator = new MulticlassClassificationEvaluator() + .setLabelCol("label") + .setPredictionCol("prediction") + .setMetricName("accuracy") + val accuracy = evaluator.evaluate(predictions) + println(s"Test Error = ${1.0 - accuracy}") + + sc.stop() + } +} From b81f847435778902fa71d45d3ee1a3b74546d844 Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Wed, 15 Jul 2020 15:23:54 +0800 Subject: [PATCH 02/11] Clean up code --- .../ml/src/main/scala/com/intel/sparkbench/ml/XGBoost.scala | 3 --- 1 file changed, 3 deletions(-) diff --git a/sparkbench/ml/src/main/scala/com/intel/sparkbench/ml/XGBoost.scala b/sparkbench/ml/src/main/scala/com/intel/sparkbench/ml/XGBoost.scala index 349b34b9a..99e7e9115 100644 --- a/sparkbench/ml/src/main/scala/com/intel/sparkbench/ml/XGBoost.scala +++ b/sparkbench/ml/src/main/scala/com/intel/sparkbench/ml/XGBoost.scala @@ -99,9 +99,6 @@ object XGBoost { val splits = data.randomSplit(Array(0.7, 0.3)) val (trainingData, testData) = (splits(0), splits(1)) - val numWorkers = sc.getConf.getInt("spark.executor.instances", 2) - val numThreads = sc.getConf.getInt("spark.executor.cores", 1) - val xgbParam = Map("eta" -> learningRate, "num_round" -> numIterations, "eta" -> learningRate, From 6210f925593718800133914e6c5e217c0123307e Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Wed, 15 Jul 2020 16:32:10 +0800 Subject: [PATCH 03/11] Add num_workers and nthread params and check spark.task.cpus --- .../scala/com/intel/sparkbench/ml/XGBoost.scala | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/sparkbench/ml/src/main/scala/com/intel/sparkbench/ml/XGBoost.scala b/sparkbench/ml/src/main/scala/com/intel/sparkbench/ml/XGBoost.scala index 99e7e9115..9f68e5abd 100644 --- a/sparkbench/ml/src/main/scala/com/intel/sparkbench/ml/XGBoost.scala +++ b/sparkbench/ml/src/main/scala/com/intel/sparkbench/ml/XGBoost.scala @@ -99,13 +99,25 @@ object XGBoost { val splits = data.randomSplit(Array(0.7, 0.3)) val (trainingData, testData) = (splits(0), splits(1)) + val numWorkers = sc.getConf.getInt("spark.executor.instances", -1) + val numThreads = sc.getConf.getInt("spark.executor.cores", -1) + val taskCPUs = sc.getConf.getInt("spark.task.cpus", -1) + + if (numWorkers == -1 || numThreads == -1 || taskCPUs == -1) { + println("XGBoost error: should set spark.executor.instances, " + + "spark.executor.cores and spark.task.cpus in Spark Config") + sys.exit(1) + } + val xgbParam = Map("eta" -> learningRate, "num_round" -> 
numIterations, "eta" -> learningRate, "num_class" -> numClasses, "max_depth" -> maxDepth, "max_bin" -> maxBins, - "objective" -> "multi:softprob" + "objective" -> "multi:softprob", + "num_workers" -> numWorkers, + "nthread" -> numThreads ) val xgbClassifier = new XGBoostClassifier(xgbParam). setFeaturesCol("features"). From 85851a6433ad3ada273d81d9c582d0b362626c02 Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Fri, 17 Jul 2020 12:05:43 +0800 Subject: [PATCH 04/11] modify README --- README.md | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 8217280bf..222870e88 100644 --- a/README.md +++ b/README.md @@ -74,27 +74,31 @@ There are totally 27 workloads in HiBench. The workloads are divided into 6 cate Gradient-boosted trees (GBT) is a popular regression method using ensembles of decision trees. This workload is implemented in spark.mllib and the input data set is generated by GradientBoostedTreeDataGenerator. -6. Linear Regression (Linear) +6. XGBoost (XGB) + + XGBoost is an optimized distributed gradient boosting library designed to be highly efficient, flexible and portable. This workload is implemented with XGBoost4J-Spark API in spark.mllib and the input data set is generated by GradientBoostedTreeDataGenerator. + +7. Linear Regression (Linear) Linear Regression (Linear) is a workload that implemented in spark.ml with ElasticNet. The input data set is generated by LinearRegressionDataGenerator. -7. Latent Dirichlet Allocation (LDA) +8. Latent Dirichlet Allocation (LDA) Latent Dirichlet allocation (LDA) is a topic model which infers topics from a collection of text documents. This workload is implemented in spark.mllib and the input data set is generated by LDADataGenerator. -8. Principal Components Analysis (PCA) +9. Principal Components Analysis (PCA) Principal component analysis (PCA) is a statistical method to find a rotation such that the first coordinate has the largest variance possible, and each succeeding coordinate in turn has the largest variance possible. PCA is used widely in dimensionality reduction. This workload is implemented in spark.ml. The input data set is generated by PCADataGenerator. -9. Random Forest (RF) +10. Random Forest (RF) Random forests (RF) are ensembles of decision trees. Random forests are one of the most successful machine learning models for classification and regression. They combine many decision trees in order to reduce the risk of overfitting. This workload is implemented in spark.mllib and the input data set is generated by RandomForestDataGenerator. -10. Support Vector Machine (SVM) +11. Support Vector Machine (SVM) Support Vector Machine (SVM) is a standard method for large-scale classification tasks. This workload is implemented in spark.mllib and the input data set is generated by SVMDataGenerator. -11. Singular Value Decomposition (SVD) +12. Singular Value Decomposition (SVD) Singular value decomposition (SVD) factorizes a matrix into three matrices. This workload is implemented in spark.mllib and its input data set is generated by SVDDataGenerator. From 93b663bd6e55479eb403f3e908d854c815c29b0b Mon Sep 17 00:00:00 2001 From: Xiaochang Wu Date: Fri, 17 Jul 2020 21:59:24 +0800 Subject: [PATCH 05/11] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 222870e88..ec4d114d3 100644 --- a/README.md +++ b/README.md @@ -74,7 +74,7 @@ There are totally 27 workloads in HiBench. 
The workloads are divided into 6 cate Gradient-boosted trees (GBT) is a popular regression method using ensembles of decision trees. This workload is implemented in spark.mllib and the input data set is generated by GradientBoostedTreeDataGenerator. -6. XGBoost (XGB) +6. XGBoost (XGBoost) XGBoost is an optimized distributed gradient boosting library designed to be highly efficient, flexible and portable. This workload is implemented with XGBoost4J-Spark API in spark.mllib and the input data set is generated by GradientBoostedTreeDataGenerator. From 31c3b5993e78c5bc5972ccedd7daaf6f7caa98c3 Mon Sep 17 00:00:00 2001 From: "Jiang, Bo" Date: Thu, 30 Jul 2020 22:32:53 +0800 Subject: [PATCH 06/11] update xgboost configuration to use v1.1.0-scala2.12 by default --- conf/hibench.conf | 2 +- conf/workloads/ml/xgboost.conf | 2 +- docs/run-sparkbench.md | 56 ++++++++++++++++++++++++++++++++++ pom.xml | 11 +++++++ sparkbench/ml/pom.xml | 4 +-- 5 files changed, 71 insertions(+), 4 deletions(-) diff --git a/conf/hibench.conf b/conf/hibench.conf index b825997c0..867ec9eb5 100644 --- a/conf/hibench.conf +++ b/conf/hibench.conf @@ -1,6 +1,6 @@ # Data scale profile. Available value is tiny, small, large, huge, gigantic and bigdata. # The definition of these profiles can be found in the workload's conf file i.e. conf/workloads/micro/wordcount.conf -hibench.scale.profile tiny +hibench.scale.profile large # Mapper number in hadoop, partition number in Spark hibench.default.map.parallelism 8 diff --git a/conf/workloads/ml/xgboost.conf b/conf/workloads/ml/xgboost.conf index 7788f6242..9c04e6da6 100644 --- a/conf/workloads/ml/xgboost.conf +++ b/conf/workloads/ml/xgboost.conf @@ -17,7 +17,7 @@ hibench.xgboost.features ${hibench.xgboost.${hibench.scale.pr hibench.xgboost.partitions ${hibench.default.map.parallelism} hibench.xgboost.numClasses 2 -hibench.xgboost.maxDepth 30 +hibench.xgboost.maxDepth 8 hibench.xgboost.maxBins 32 hibench.xgboost.numIterations 20 hibench.xgboost.learningRate 0.1 diff --git a/docs/run-sparkbench.md b/docs/run-sparkbench.md index 459bb3e94..3cf9cf676 100644 --- a/docs/run-sparkbench.md +++ b/docs/run-sparkbench.md @@ -88,3 +88,59 @@ hibench.yarn.executor.num | Spark executor number in Yarn mode hibench.yarn.executor.cores | Spark executor cores in Yarn mode spark.executor.memory | Spark executor memory spark.driver.memory | Spark driver memory + + +### 8. Run xgboost workload ### + +Hibench xgboost benchmark depends on the xgboost libraries to build and run. The libs are ```xgboost4j_-.jar``` and ```xgboost4j-spark_-.jar```.
+The relevant configurations are in ```./sparkbench/ml/pom.xml```
+```
+    <dependency>
+      <groupId>ml.dmlc</groupId>
+      <artifactId>xgboost4j_${scala.binary.version}</artifactId>
+      <version>1.1.0</version>
+    </dependency>
+    <dependency>
+      <groupId>ml.dmlc</groupId>
+      <artifactId>xgboost4j-spark_${scala.binary.version}</artifactId>
+      <version>1.1.0</version>
+    </dependency>
+```
+and ```./pom.xml```
+```
+    <repository>
+      <id>xgboostrepo</id>
+      <name>XGBoost Maven Repo</name>
+      <url>https://s3-us-west-2.amazonaws.com/xgboost-maven-repo/release</url>
+      <releases>
+        <enabled>true</enabled>
+      </releases>
+      <snapshots>
+        <enabled>false</enabled>
+      </snapshots>
+    </repository>
+```
+
+#### 8.a latest xgboost release (default) ####
+
+By default, the hibench xgboost benchmark is configured to use the latest xgboost release from https://s3-us-west-2.amazonaws.com/xgboost-maven-repo/release.
+To use it, simply build hibench, prepare data and run xgboost benchmark. For example, +``` +$ mvn -Psparkbench -Dmodules -Pml -Dspark=2.4 -Dscala=2.12 clean package +$ bin/workloads/ml/xgboost/prepare/prepare.sh && hdfs dfs -du -s -h /HiBench/XGBoost/Input +$ bin/workloads/ml/xgboost/spark/run.sh +``` + +#### 8.b other xgboost releases #### + +To use other xgboost releases, change the xgboost versions for xgboost4j and xgboost4j-spark to the target versions in ```./sparkbench/ml/pom.xml```. The ```scala.binary.version``` can be specified by command line parameter ```-Dscala```.
+e.g. to use xgboost v1.0.0, change ```1.1.0``` to ```1.0.0``` for both xgboost4j and xgboost4j-spark.
+If the xgboost release is from other maven repo, update the xgboostrepo url in ```./pom.xml``` as well.
+After that, build hibench, prepare data and run xgboost benchmark. + +#### 8.c xgboost jar files #### + +If you only have the xgboost jar files, just copy them to $SPARK_HOME/jars/ and update the relevant versions for xgboost4j and xgboost4j-spark in sparkbench/ml/pom.xml to get aligned.
+For example, if xgboost is built from source on a Linux platform, the jars will be generated and installed to ```~/.m2/repository/ml/dmlc/xgboost4j_<scala.binary.version>/<xgboost.version>-SNAPSHOT/``` and ```~/.m2/repository/ml/dmlc/xgboost4j-spark_<scala.binary.version>/<xgboost.version>-SNAPSHOT/``` respectively. To use them, copy the 2 jars to $SPARK_HOME/jars/ and update the relevant versions for xgboost4j and xgboost4j-spark in the pom.xml files.
+After that, build hibench, prepare data and run xgboost benchmark. + diff --git a/pom.xml b/pom.xml index f93d602c9..a80101570 100644 --- a/pom.xml +++ b/pom.xml @@ -80,6 +80,17 @@ Scala-tools Maven 2 Repository https://oss.sonatype.org/content/groups/scala-tools/ + + xgboostrepo + XGBoost Maven Repo + https://s3-us-west-2.amazonaws.com/xgboost-maven-repo/release + + true + + + false + + diff --git a/sparkbench/ml/pom.xml b/sparkbench/ml/pom.xml index 44ae66cc9..32075c625 100644 --- a/sparkbench/ml/pom.xml +++ b/sparkbench/ml/pom.xml @@ -56,12 +56,12 @@ ml.dmlc xgboost4j_${scala.binary.version} - 1.0.0 + 1.1.0 ml.dmlc xgboost4j-spark_${scala.binary.version} - 1.0.0 + 1.1.0 From 5791a0fec320be597b89ffa4a2e495e399b524f0 Mon Sep 17 00:00:00 2001 From: "Jiang, Bo" Date: Thu, 30 Jul 2020 23:37:09 +0800 Subject: [PATCH 07/11] refine XGBoost.scala, use pipeline --- .../scala/com/intel/sparkbench/ml/XGBoost.scala | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/sparkbench/ml/src/main/scala/com/intel/sparkbench/ml/XGBoost.scala b/sparkbench/ml/src/main/scala/com/intel/sparkbench/ml/XGBoost.scala index 9f68e5abd..f497a1613 100644 --- a/sparkbench/ml/src/main/scala/com/intel/sparkbench/ml/XGBoost.scala +++ b/sparkbench/ml/src/main/scala/com/intel/sparkbench/ml/XGBoost.scala @@ -18,11 +18,12 @@ package com.intel.hibench.sparkbench.ml import org.apache.spark.{SparkConf, SparkContext} -import org.apache.spark.mllib.tree.GradientBoostedTrees -import org.apache.spark.mllib.tree.configuration.BoostingStrategy -import org.apache.spark.mllib.tree.model.GradientBoostedTreesModel +// import org.apache.spark.mllib.tree.GradientBoostedTrees +// import org.apache.spark.mllib.tree.configuration.BoostingStrategy +// import org.apache.spark.mllib.tree.model.GradientBoostedTreesModel import org.apache.spark.rdd.RDD import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.ml.Pipeline import org.apache.spark.ml.feature.{LabeledPoint => NewLabeledPoint} import ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator @@ -33,7 +34,7 @@ object XGBoost { case class Params( numClasses: Int = 2, - maxDepth: Int = 30, + maxDepth: Int = 8, maxBins: Int = 32, numIterations: Int = 20, learningRate: Double = 0.1, @@ -93,7 +94,7 @@ object XGBoost { val mllibRDD: RDD[LabeledPoint] = sc.objectFile(dataPath) // Convert to ML LabeledPoint and to DataFrame val mlRDD: RDD[NewLabeledPoint] = mllibRDD.map { p => NewLabeledPoint(p.label, p.features.asML) } - val data = mlRDD.toDF + val data = mlRDD.toDF("label", "features") // Split the data into training and test sets (30% held out for testing) val splits = data.randomSplit(Array(0.7, 0.3)) @@ -123,7 +124,9 @@ object XGBoost { setFeaturesCol("features"). setLabelCol("label") - val model = xgbClassifier.fit(trainingData) + val pipeline = new Pipeline().setStages(Array(xgbClassifier)) + + val model = pipeline.fit(trainingData) // Make predictions. val predictions = model.transform(testData) From 38fc921b90a89296cfbed56da59af90e0b0103a2 Mon Sep 17 00:00:00 2001 From: bobjiang82 Date: Fri, 7 Aug 2020 09:28:14 +0800 Subject: [PATCH 08/11] Update hibench.conf --- conf/hibench.conf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/hibench.conf b/conf/hibench.conf index 867ec9eb5..b825997c0 100644 --- a/conf/hibench.conf +++ b/conf/hibench.conf @@ -1,6 +1,6 @@ # Data scale profile. Available value is tiny, small, large, huge, gigantic and bigdata. 
# The definition of these profiles can be found in the workload's conf file i.e. conf/workloads/micro/wordcount.conf -hibench.scale.profile large +hibench.scale.profile tiny # Mapper number in hadoop, partition number in Spark hibench.default.map.parallelism 8 From a213d7536f4ce7323959ac97df13fee0d094ec50 Mon Sep 17 00:00:00 2001 From: bobjiang82 Date: Fri, 7 Aug 2020 10:46:00 +0800 Subject: [PATCH 09/11] Update run-sparkbench.md commit code first and continue to refine doc. --- docs/run-sparkbench.md | 56 ------------------------------------------ 1 file changed, 56 deletions(-) diff --git a/docs/run-sparkbench.md b/docs/run-sparkbench.md index 3cf9cf676..459bb3e94 100644 --- a/docs/run-sparkbench.md +++ b/docs/run-sparkbench.md @@ -88,59 +88,3 @@ hibench.yarn.executor.num | Spark executor number in Yarn mode hibench.yarn.executor.cores | Spark executor cores in Yarn mode spark.executor.memory | Spark executor memory spark.driver.memory | Spark driver memory - - -### 8. Run xgboost workload ### - -Hibench xgboost benchmark depends on the xgboost libraries to build and run. The libs are ```xgboost4j_-.jar``` and ```xgboost4j-spark_-.jar```.
-The relevant configurations are in ```./sparkbench/ml/pom.xml```
-```
-    <dependency>
-      <groupId>ml.dmlc</groupId>
-      <artifactId>xgboost4j_${scala.binary.version}</artifactId>
-      <version>1.1.0</version>
-    </dependency>
-    <dependency>
-      <groupId>ml.dmlc</groupId>
-      <artifactId>xgboost4j-spark_${scala.binary.version}</artifactId>
-      <version>1.1.0</version>
-    </dependency>
-```
-and ```./pom.xml```
-```
-    <repository>
-      <id>xgboostrepo</id>
-      <name>XGBoost Maven Repo</name>
-      <url>https://s3-us-west-2.amazonaws.com/xgboost-maven-repo/release</url>
-      <releases>
-        <enabled>true</enabled>
-      </releases>
-      <snapshots>
-        <enabled>false</enabled>
-      </snapshots>
-    </repository>
-```
-
-#### 8.a latest xgboost release (default) ####
-
-By default, the hibench xgboost benchmark is configured to use the latest xgboost release from https://s3-us-west-2.amazonaws.com/xgboost-maven-repo/release.
-To use it, simply build hibench, prepare data and run xgboost benchmark. For example, -``` -$ mvn -Psparkbench -Dmodules -Pml -Dspark=2.4 -Dscala=2.12 clean package -$ bin/workloads/ml/xgboost/prepare/prepare.sh && hdfs dfs -du -s -h /HiBench/XGBoost/Input -$ bin/workloads/ml/xgboost/spark/run.sh -``` - -#### 8.b other xgboost releases #### - -To use other xgboost releases, change the xgboost versions for xgboost4j and xgboost4j-spark to the target versions in ```./sparkbench/ml/pom.xml```. The ```scala.binary.version``` can be specified by command line parameter ```-Dscala```.
-e.g. to use xgboost v1.0.0, change ```1.1.0``` to ```1.0.0``` for both xgboost4j and xgboost4j-spark.
-If the xgboost release is from other maven repo, update the xgboostrepo url in ```./pom.xml``` as well.
-After that, build hibench, prepare data and run xgboost benchmark. - -#### 8.c xgboost jar files #### - -If you only have the xgboost jar files, just copy them to $SPARK_HOME/jars/ and update the relevant versions for xgboost4j and xgboost4j-spark in sparkbench/ml/pom.xml to get aligned.
-For example, if xgboost is built from source on a Linux platform, the jars will be generated and installed to ```~/.m2/repository/ml/dmlc/xgboost4j_<scala.binary.version>/<xgboost.version>-SNAPSHOT/``` and ```~/.m2/repository/ml/dmlc/xgboost4j-spark_<scala.binary.version>/<xgboost.version>-SNAPSHOT/``` respectively. To use them, copy the 2 jars to $SPARK_HOME/jars/ and update the relevant versions for xgboost4j and xgboost4j-spark in the pom.xml files.
-After that, build hibench, prepare data and run xgboost benchmark. - From b92c8ccfa6ec602520179f7bf51b89db970d67a5 Mon Sep 17 00:00:00 2001 From: bobjiang82 Date: Fri, 14 Aug 2020 18:06:06 +0800 Subject: [PATCH 10/11] Update benchmarks.lst --- conf/benchmarks.lst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/conf/benchmarks.lst b/conf/benchmarks.lst index 3ef7717b5..4d62ba3a4 100644 --- a/conf/benchmarks.lst +++ b/conf/benchmarks.lst @@ -24,5 +24,6 @@ ml.linear ml.lda ml.svm ml.gmm +ml.xgboost -graph.nweight \ No newline at end of file +graph.nweight From 7902f59f5e1479ded362981a4e6897ce1a2bd5ea Mon Sep 17 00:00:00 2001 From: bobjiang82 Date: Fri, 14 Aug 2020 18:06:35 +0800 Subject: [PATCH 11/11] Update benchmarks_ml.lst --- travis/benchmarks_ml.lst | 1 + 1 file changed, 1 insertion(+) diff --git a/travis/benchmarks_ml.lst b/travis/benchmarks_ml.lst index 6e4894a5a..362df64d4 100644 --- a/travis/benchmarks_ml.lst +++ b/travis/benchmarks_ml.lst @@ -9,3 +9,4 @@ ml.linear ml.lda ml.svm ml.gmm +ml.xgboost
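
The patch series above wires XGBoost4J-Spark into HiBench through the prepare/run scripts, the workload config, and XGBoost.scala. For readers who want to try the same training path outside the HiBench harness, below is a minimal, self-contained sketch that mirrors the calls used in XGBoost.scala (Map-based XGBoostClassifier construction, fit, transform, accuracy evaluation). The object name, the synthetic dataset, and the scaled-down parameter values are illustrative assumptions and are not part of the patches; it assumes xgboost4j and xgboost4j-spark jars matching your Scala version are on the classpath.

```
// Minimal smoke test for the XGBoost4J-Spark path used by the new workload.
// Dataset, object name and parameter values are illustrative assumptions.
import ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.SparkSession

object XGBoostSmokeTest {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.appName("XGBoostSmokeTest").getOrCreate()
    import spark.implicits._

    // Tiny synthetic binary-classification set: label is 1.0 when the first feature is positive.
    val data = Seq.tabulate(200) { i =>
      val x = if (i % 2 == 0) 1.0 else -1.0
      (if (x > 0) 1.0 else 0.0, Vectors.dense(x, (i % 7).toDouble))
    }.toDF("label", "features")

    val Array(train, test) = data.randomSplit(Array(0.7, 0.3), seed = 42L)

    // Same Map-based construction as XGBoost.scala in the patch, scaled down for a quick local run.
    val xgbParam = Map(
      "eta" -> 0.1,
      "num_round" -> 5,
      "num_class" -> 2,
      "max_depth" -> 3,
      "objective" -> "multi:softprob",
      "num_workers" -> 1,
      "nthread" -> 1)

    val model = new XGBoostClassifier(xgbParam)
      .setFeaturesCol("features")
      .setLabelCol("label")
      .fit(train)

    // Accuracy on the held-out split, as in the benchmark's evaluator.
    val accuracy = new MulticlassClassificationEvaluator()
      .setLabelCol("label")
      .setPredictionCol("prediction")
      .setMetricName("accuracy")
      .evaluate(model.transform(test))
    println(s"Smoke-test accuracy = $accuracy")

    spark.stop()
  }
}
```

With the xgboost4j jars available (see section 8 of docs/run-sparkbench.md above), this can be submitted with spark-submit like any other Spark application; the benchmark itself feeds the equivalent parameters from conf/workloads/ml/xgboost.conf through bin/workloads/ml/xgboost/spark/run.sh.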