package com.immooc.spark
import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.util.MLUtils
/**
 * Trains an MLlib decision-tree classifier on a small text data set and prints
 * a sample of predictions, the test error rate and the learned tree.
 *
 * Input format (one record per line): `label,f1 f2 f3 ...` — a numeric label,
 * a comma, then space-separated numeric features. This is NOT LIBSVM format.
 *
 * Usage: the first command-line argument, if present, is the input path;
 * otherwise the original hard-coded sample path is used.
 */
object DecisionTreeTest {

  def main(args: Array[String]): Unit = {
    // Path of the sample file used when no argument is supplied.
    val defaultDataPath = "file:///Users/walle/Documents/D3/sparkmlib/data.txt"
    val dataPath = args.headOption.getOrElse(defaultDataPath)

    val conf = new SparkConf().setAppName("DecisionTreeTest").setMaster("local[2]")
    val sc = new SparkContext(conf)
    try {
      Logger.getRootLogger.setLevel(Level.WARN)

      // Read the samples; blank lines are skipped so a trailing newline in the
      // file does not fail the job with an ArrayIndexOutOfBoundsException.
      val data = sc.textFile(dataPath)
      val parsedData = data.map(_.trim).filter(_.nonEmpty).map { line =>
        val parts = line.split(',')
        val features = parts(1).trim.split("\\s+").map(_.toDouble)
        LabeledPoint(parts(0).toDouble, Vectors.dense(features))
      }

      // Split the samples 70% / 30% into training and test sets (fixed seed).
      val splits = parsedData.randomSplit(Array(0.7, 0.3), seed = 11L)
      val training = splits(0).cache()
      val test = splits(1)

      // Training parameters: binary classification, no categorical features.
      val numClasses = 2
      val categoricalFeaturesInfo = Map[Int, Int]()
      val impurity = "gini"
      val maxDepth = 5
      val maxBins = 32
      val model = DecisionTree.trainClassifier(
        training, numClasses, categoricalFeaturesInfo, impurity, maxDepth, maxBins)

      // Predict on the test set. Cached because it is consumed by several
      // actions below (take, count, filter + count).
      val labelAndPreds = test.map { point =>
        (point.label, model.predict(point.features))
      }.cache()

      // Show up to 15 (label, prediction) pairs side by side.
      println("label" + "\t" + "prediction")
      labelAndPreds.take(15).foreach { case (label, prediction) =>
        println(s"$label\t$prediction")
      }

      // Error rate of the tree on the test set. labelAndPreds has exactly one
      // element per test point, so its count equals the test-set size.
      val testCount = labelAndPreds.count()
      val misclassified = labelAndPreds.filter { case (label, prediction) => label != prediction }.count()
      // An empty test split has no defined error rate; report NaN explicitly
      // (the same value the unguarded 0.0 / 0 division would produce).
      val testErr = if (testCount == 0L) Double.NaN else misclassified.toDouble / testCount
      println("Test Error = " + testErr)

      // Print the decision rules of the learned tree.
      println("Learned classification tree model:\n" + model.toDebugString)
    } finally {
      // Always release the Spark context, even when a step above throws.
      sc.stop()
    }
  }
}
1. 数据
0,32 1 1 0
0,25 1 2 0
1,29 1 2 1
1,24 1 1 0
0,31 1 1 0
1,35 1 2 1
0,30 0 1 0
0,31 1 1 0
1,30 1 2 1
1,21 1 1 0
0,21 1 2 0
1,21 1 2 1
0,29 0 2 1
0,29 1 0 1
0,29 0 2 1
1,30 1 1 0
2. 结果
label prediction
1.0 1.0
1.0 1.0
1.0 0.0
0.0 1.0
0.0 0.0
Test Error = 0.4
Learned classification tree model:
DecisionTreeModel classifier of depth 5 with 11 nodes
  If (feature 0 <= 33.5)
   If (feature 0 <= 30.5)
    If (feature 1 <= 0.5)
     Predict: 0.0
    Else (feature 1 > 0.5)
     If (feature 0 <= 27.0)
      If (feature 2 <= 1.5)
       Predict: 1.0
      Else (feature 2 > 1.5)
       Predict: 0.0
     Else (feature 0 > 27.0)
      Predict: 1.0
   Else (feature 0 > 30.5)
    Predict: 0.0
  Else (feature 0 > 33.5)
   Predict: 1.0
4691以上所述就是小编给大家介绍的《Spark mllib 决策树》,希望对大家有所帮助,如果大家有任何疑问请给我留言,小编会及时回复大家的。在此也非常感谢大家对 码农网 的支持!
猜你喜欢:本站部分资源来源于网络,本站转载出于传递更多信息之目的,版权归原作者或者来源机构所有,如转载稿涉及版权问题,请联系我们。
Java并发编程实战
Brian Goetz、Tim Peierls、Joshua Bloch、Joseph Bowbeer、David Holmes、Doug Lea / 童云兰 / 机械工业出版社华章公司 / 2012-2 / 69.00元
本书深入浅出地介绍了Java线程和并发,是一本完美的Java并发参考手册。书中从并发性和线程安全性的基本概念出发,介绍了如何使用类库提供的基本并发构建块,用于避免并发危险、构造线程安全的类及验证线程安全的规则,如何将小的线程安全类组合成更大的线程安全类,如何利用线程来提高并发应用程序的吞吐量,如何识别可并行执行的任务,如何提高单线程子系统的响应性,如何确保并发程序执行预期任务,如何提高并发代码的性......一起来看看 《Java并发编程实战》 这本书的介绍吧!