#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import sys
import random

from pyspark import RDD, since
from pyspark.mllib.common import callMLlibFunc, inherit_doc, JavaModelWrapper
from pyspark.mllib.linalg import _convert_to_vector
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.util import JavaLoader, JavaSaveable
from typing import Dict, Optional, Tuple, Union, overload, TYPE_CHECKING
from pyspark.core.rdd import RDD

if TYPE_CHECKING:
    from pyspark.mllib._typing import VectorLike

__all__ = [
    "DecisionTreeModel",
    "DecisionTree",
    "RandomForestModel",
    "RandomForest",
    "GradientBoostedTreesModel",
    "GradientBoostedTrees",
]


class TreeEnsembleModel(JavaModelWrapper, JavaSaveable):
    """TreeEnsembleModel

    Common base for ensemble models (random forests, gradient-boosted
    trees) that wrap a JVM-side model object.

    .. versionadded:: 1.3.0
    """

    # The two @overload stubs only describe the type signature for static
    # checkers; the third definition is the one executed at runtime.
    @overload
    def predict(self, x: "VectorLike") -> float:
        ...

    @overload
    def predict(self, x: RDD["VectorLike"]) -> RDD[float]:
        ...

    def predict(self, x: Union["VectorLike", RDD["VectorLike"]]) -> Union[float, RDD[float]]:
        """
        Predict values for a single data point or an RDD of points using
        the model trained.

        .. versionadded:: 1.3.0

        Notes
        -----
        In Python, predict cannot currently be used within an RDD
        transformation or action.
        Call predict directly on the RDD instead.
        """
        if isinstance(x, RDD):
            # Distributed case: convert every point to an MLlib Vector
            # before handing the whole RDD to the JVM model.
            return self.call("predict", x.map(_convert_to_vector))
        else:
            # Single-point case: one conversion, one JVM round trip.
            return self.call("predict", _convert_to_vector(x))

    @since("1.3.0")
    def numTrees(self) -> int:
        """
        Get number of trees in ensemble.
        """
        return self.call("numTrees")

    @since("1.3.0")
    def totalNumNodes(self) -> int:
        """
        Get total number of nodes, summed over all trees in the ensemble.
        """
        return self.call("totalNumNodes")

    def __repr__(self) -> str:
        """Summary of model"""
        return self._java_model.toString()

    @since("1.3.0")
    def toDebugString(self) -> str:
        """Full model"""
        return self._java_model.toDebugString()
class DecisionTreeModel(JavaModelWrapper, JavaSaveable, JavaLoader["DecisionTreeModel"]):
    """
    A decision tree model for classification or regression.

    .. versionadded:: 1.1.0
    """

    # Overload stubs for static type checking only; the runtime
    # implementation follows.
    @overload
    def predict(self, x: "VectorLike") -> float:
        ...

    @overload
    def predict(self, x: RDD["VectorLike"]) -> RDD[float]:
        ...

    def predict(self, x: Union["VectorLike", RDD["VectorLike"]]) -> Union[float, RDD[float]]:
        """
        Predict the label of one or more examples.

        .. versionadded:: 1.1.0

        Parameters
        ----------
        x : :py:class:`pyspark.mllib.linalg.Vector` or :py:class:`pyspark.RDD`
            Data point (feature vector), or an RDD of data points (feature
            vectors).

        Notes
        -----
        In Python, predict cannot currently be used within an RDD
        transformation or action.
        Call predict directly on the RDD instead.
        """
        # Normalize the argument first (an RDD of points or a single
        # point), then make one dispatch to the JVM-side model.
        vectorized = (
            x.map(_convert_to_vector) if isinstance(x, RDD) else _convert_to_vector(x)
        )
        return self.call("predict", vectorized)

    @since("1.1.0")
    def numNodes(self) -> int:
        """Return the total node count of the tree, leaf nodes included."""
        return self._java_model.numNodes()

    @since("1.1.0")
    def depth(self) -> int:
        """
        Return the depth of the tree: a lone leaf has depth 0, while a
        single internal node with two leaf children has depth 1.
        """
        return self._java_model.depth()

    def __repr__(self) -> str:
        """Short human-readable summary of the model."""
        return self._java_model.toString()
class DecisionTree:
    """
    Learning algorithm for a decision tree model for classification or
    regression.

    .. versionadded:: 1.1.0
    """

    @classmethod
    def _train(
        cls,
        data: RDD[LabeledPoint],
        # NOTE(review): this parameter shadows the ``type`` builtin and is
        # named ``algo`` in the sibling RandomForest._train; kept as-is
        # since the public trainers call it positionally.
        type: str,
        numClasses: int,
        features: Dict[int, int],
        impurity: str = "gini",
        maxDepth: int = 5,
        maxBins: int = 32,
        minInstancesPerNode: int = 1,
        minInfoGain: float = 0.0,
    ) -> DecisionTreeModel:
        # Cheap sanity check: inspect only the first element rather than
        # scanning the whole (potentially huge) RDD.
        first = data.first()
        assert isinstance(first, LabeledPoint), "the data should be RDD of LabeledPoint"
        # Marshal everything to the JVM trainer; argument order must match
        # the Scala-side trainDecisionTreeModel signature exactly.
        model = callMLlibFunc(
            "trainDecisionTreeModel",
            data,
            type,
            numClasses,
            features,
            impurity,
            maxDepth,
            maxBins,
            minInstancesPerNode,
            minInfoGain,
        )
        return DecisionTreeModel(model)

    @classmethod
    def trainClassifier(
        cls,
        data: RDD[LabeledPoint],
        numClasses: int,
        categoricalFeaturesInfo: Dict[int, int],
        impurity: str = "gini",
        maxDepth: int = 5,
        maxBins: int = 32,
        minInstancesPerNode: int = 1,
        minInfoGain: float = 0.0,
    ) -> DecisionTreeModel:
        """
        Train a decision tree model for classification.

        .. versionadded:: 1.1.0

        Parameters
        ----------
        data : :py:class:`pyspark.RDD`
            Training data: RDD of LabeledPoint. Labels should take values
            {0, 1, ..., numClasses-1}.
        numClasses : int
            Number of classes for classification.
        categoricalFeaturesInfo : dict
            Map storing arity of categorical features. An entry (n -> k)
            indicates that feature n is categorical with k categories
            indexed from 0: {0, 1, ..., k-1}.
        impurity : str, optional
            Criterion used for information gain calculation.
            Supported values: "gini" or "entropy".
            (default: "gini")
        maxDepth : int, optional
            Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1
            means 1 internal node + 2 leaf nodes).
            (default: 5)
        maxBins : int, optional
            Number of bins used for finding splits at each node.
            (default: 32)
        minInstancesPerNode : int, optional
            Minimum number of instances required at child nodes to create
            the parent split.
            (default: 1)
        minInfoGain : float, optional
            Minimum info gain required to create a split.
            (default: 0.0)

        Returns
        -------
        :py:class:`DecisionTreeModel`

        Examples
        --------
        >>> from numpy import array
        >>> from pyspark.mllib.regression import LabeledPoint
        >>> from pyspark.mllib.tree import DecisionTree
        >>>
        >>> data = [
        ...     LabeledPoint(0.0, [0.0]),
        ...     LabeledPoint(1.0, [1.0]),
        ...     LabeledPoint(1.0, [2.0]),
        ...     LabeledPoint(1.0, [3.0])
        ... ]
        >>> model = DecisionTree.trainClassifier(sc.parallelize(data), 2, {})
        >>> print(model)
        DecisionTreeModel classifier of depth 1 with 3 nodes

        >>> print(model.toDebugString())
        DecisionTreeModel classifier of depth 1 with 3 nodes
          If (feature 0 <= 0.5)
           Predict: 0.0
          Else (feature 0 > 0.5)
           Predict: 1.0
        >>> model.predict(array([1.0]))
        1.0
        >>> model.predict(array([0.0]))
        0.0
        >>> rdd = sc.parallelize([[1.0], [0.0]])
        >>> model.predict(rdd).collect()
        [1.0, 0.0]
        """
        return cls._train(
            data,
            "classification",
            numClasses,
            categoricalFeaturesInfo,
            impurity,
            maxDepth,
            maxBins,
            minInstancesPerNode,
            minInfoGain,
        )

    @classmethod
    @since("1.1.0")
    def trainRegressor(
        cls,
        data: RDD[LabeledPoint],
        categoricalFeaturesInfo: Dict[int, int],
        impurity: str = "variance",
        maxDepth: int = 5,
        maxBins: int = 32,
        minInstancesPerNode: int = 1,
        minInfoGain: float = 0.0,
    ) -> DecisionTreeModel:
        """
        Train a decision tree model for regression.

        Parameters
        ----------
        data : :py:class:`pyspark.RDD`
            Training data: RDD of LabeledPoint. Labels are real numbers.
        categoricalFeaturesInfo : dict
            Map storing arity of categorical features. An entry (n -> k)
            indicates that feature n is categorical with k categories
            indexed from 0: {0, 1, ..., k-1}.
        impurity : str, optional
            Criterion used for information gain calculation.
            The only supported value for regression is "variance".
            (default: "variance")
        maxDepth : int, optional
            Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1
            means 1 internal node + 2 leaf nodes).
            (default: 5)
        maxBins : int, optional
            Number of bins used for finding splits at each node.
            (default: 32)
        minInstancesPerNode : int, optional
            Minimum number of instances required at child nodes to create
            the parent split.
            (default: 1)
        minInfoGain : float, optional
            Minimum info gain required to create a split.
            (default: 0.0)

        Returns
        -------
        :py:class:`DecisionTreeModel`

        Examples
        --------
        >>> from pyspark.mllib.regression import LabeledPoint
        >>> from pyspark.mllib.tree import DecisionTree
        >>> from pyspark.mllib.linalg import SparseVector
        >>>
        >>> sparse_data = [
        ...     LabeledPoint(0.0, SparseVector(2, {0: 0.0})),
        ...     LabeledPoint(1.0, SparseVector(2, {1: 1.0})),
        ...     LabeledPoint(0.0, SparseVector(2, {0: 0.0})),
        ...     LabeledPoint(1.0, SparseVector(2, {1: 2.0}))
        ... ]
        >>>
        >>> model = DecisionTree.trainRegressor(sc.parallelize(sparse_data), {})
        >>> model.predict(SparseVector(2, {1: 1.0}))
        1.0
        >>> model.predict(SparseVector(2, {1: 0.0}))
        0.0
        >>> rdd = sc.parallelize([[0.0, 1.0], [0.0, 0.0]])
        >>> model.predict(rdd).collect()
        [1.0, 0.0]
        """
        # numClasses is meaningless for regression; 0 is the sentinel the
        # JVM side expects.
        return cls._train(
            data,
            "regression",
            0,
            categoricalFeaturesInfo,
            impurity,
            maxDepth,
            maxBins,
            minInstancesPerNode,
            minInfoGain,
        )
@inherit_doc
class RandomForestModel(TreeEnsembleModel, JavaLoader["RandomForestModel"]):
    """
    Represents a random forest model.

    .. versionadded:: 1.2.0
    """

    @classmethod
    def _java_loader_class(cls) -> str:
        # Fully-qualified Scala class that JavaLoader uses to reconstruct
        # the JVM-side model when loading from disk.
        return "org.apache.spark.mllib.tree.model.RandomForestModel"
class RandomForest:
    """
    Learning algorithm for a random forest model for classification or
    regression.

    .. versionadded:: 1.2.0
    """

    # Accepted values for ``featureSubsetStrategy``; validated in _train.
    supportedFeatureSubsetStrategies: Tuple[str, ...] = ("auto", "all", "sqrt", "log2", "onethird")

    @classmethod
    def _train(
        cls,
        data: RDD[LabeledPoint],
        algo: str,
        numClasses: int,
        categoricalFeaturesInfo: Dict[int, int],
        numTrees: int,
        featureSubsetStrategy: str,
        impurity: str,
        maxDepth: int,
        maxBins: int,
        seed: Optional[int],
    ) -> RandomForestModel:
        # Cheap sanity check: inspect only the first element rather than
        # scanning the whole (potentially huge) RDD.
        first = data.first()
        assert isinstance(first, LabeledPoint), "the data should be RDD of LabeledPoint"
        if featureSubsetStrategy not in cls.supportedFeatureSubsetStrategies:
            raise ValueError("unsupported featureSubsetStrategy: %s" % featureSubsetStrategy)
        if seed is None:
            # Draw the seed on the Python driver so the value passed to the
            # JVM is always a concrete int (1 << 30 keeps it within a
            # signed 32-bit range).
            seed = random.randint(0, 1 << 30)
        # Argument order must match the Scala-side trainRandomForestModel
        # signature exactly.
        model = callMLlibFunc(
            "trainRandomForestModel",
            data,
            algo,
            numClasses,
            categoricalFeaturesInfo,
            numTrees,
            featureSubsetStrategy,
            impurity,
            maxDepth,
            maxBins,
            seed,
        )
        return RandomForestModel(model)

    @classmethod
    def trainClassifier(
        cls,
        data: RDD[LabeledPoint],
        numClasses: int,
        categoricalFeaturesInfo: Dict[int, int],
        numTrees: int,
        featureSubsetStrategy: str = "auto",
        impurity: str = "gini",
        maxDepth: int = 4,
        maxBins: int = 32,
        seed: Optional[int] = None,
    ) -> RandomForestModel:
        """
        Train a random forest model for binary or multiclass
        classification.

        .. versionadded:: 1.2.0

        Parameters
        ----------
        data : :py:class:`pyspark.RDD`
            Training dataset: RDD of LabeledPoint. Labels should take values
            {0, 1, ..., numClasses-1}.
        numClasses : int
            Number of classes for classification.
        categoricalFeaturesInfo : dict
            Map storing arity of categorical features. An entry (n -> k)
            indicates that feature n is categorical with k categories
            indexed from 0: {0, 1, ..., k-1}.
        numTrees : int
            Number of trees in the random forest.
        featureSubsetStrategy : str, optional
            Number of features to consider for splits at each node.
            Supported values: "auto", "all", "sqrt", "log2", "onethird".
            If "auto" is set, this parameter is set based on numTrees:
            if numTrees == 1, set to "all";
            if numTrees > 1 (forest) set to "sqrt".
            (default: "auto")
        impurity : str, optional
            Criterion used for information gain calculation.
            Supported values: "gini" or "entropy".
            (default: "gini")
        maxDepth : int, optional
            Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1
            means 1 internal node + 2 leaf nodes).
            (default: 4)
        maxBins : int, optional
            Maximum number of bins used for splitting features.
            (default: 32)
        seed : int, optional
            Random seed for bootstrapping and choosing feature subsets.
            Set as None to generate seed based on system time.
            (default: None)

        Returns
        -------
        :py:class:`RandomForestModel`
            that can be used for prediction.

        Examples
        --------
        >>> from pyspark.mllib.regression import LabeledPoint
        >>> from pyspark.mllib.tree import RandomForest
        >>>
        >>> data = [
        ...     LabeledPoint(0.0, [0.0]),
        ...     LabeledPoint(0.0, [1.0]),
        ...     LabeledPoint(1.0, [2.0]),
        ...     LabeledPoint(1.0, [3.0])
        ... ]
        >>> model = RandomForest.trainClassifier(sc.parallelize(data), 2, {}, 3, seed=42)
        >>> model.numTrees()
        3
        >>> model.totalNumNodes()
        7
        >>> print(model)
        TreeEnsembleModel classifier with 3 trees
        <BLANKLINE>
        >>> print(model.toDebugString())
        TreeEnsembleModel classifier with 3 trees
          Tree 0:
            Predict: 1.0
          Tree 1:
            If (feature 0 <= 1.5)
             Predict: 0.0
            Else (feature 0 > 1.5)
             Predict: 1.0
          Tree 2:
            If (feature 0 <= 1.5)
             Predict: 0.0
            Else (feature 0 > 1.5)
             Predict: 1.0
        <BLANKLINE>
        >>> model.predict([2.0])
        1.0
        >>> model.predict([0.0])
        0.0
        >>> rdd = sc.parallelize([[3.0], [1.0]])
        >>> model.predict(rdd).collect()
        [1.0, 0.0]
        """
        return cls._train(
            data,
            "classification",
            numClasses,
            categoricalFeaturesInfo,
            numTrees,
            featureSubsetStrategy,
            impurity,
            maxDepth,
            maxBins,
            seed,
        )

    @classmethod
    def trainRegressor(
        cls,
        data: RDD[LabeledPoint],
        categoricalFeaturesInfo: Dict[int, int],
        numTrees: int,
        featureSubsetStrategy: str = "auto",
        impurity: str = "variance",
        maxDepth: int = 4,
        maxBins: int = 32,
        seed: Optional[int] = None,
    ) -> RandomForestModel:
        """
        Train a random forest model for regression.

        .. versionadded:: 1.2.0

        Parameters
        ----------
        data : :py:class:`pyspark.RDD`
            Training dataset: RDD of LabeledPoint. Labels are real numbers.
        categoricalFeaturesInfo : dict
            Map storing arity of categorical features. An entry (n -> k)
            indicates that feature n is categorical with k categories
            indexed from 0: {0, 1, ..., k-1}.
        numTrees : int
            Number of trees in the random forest.
        featureSubsetStrategy : str, optional
            Number of features to consider for splits at each node.
            Supported values: "auto", "all", "sqrt", "log2", "onethird".
            If "auto" is set, this parameter is set based on numTrees:

            - if numTrees == 1, set to "all";
            - if numTrees > 1 (forest) set to "onethird" for regression.

            (default: "auto")
        impurity : str, optional
            Criterion used for information gain calculation.
            The only supported value for regression is "variance".
            (default: "variance")
        maxDepth : int, optional
            Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1
            means 1 internal node + 2 leaf nodes).
            (default: 4)
        maxBins : int, optional
            Maximum number of bins used for splitting features.
            (default: 32)
        seed : int, optional
            Random seed for bootstrapping and choosing feature subsets.
            Set as None to generate seed based on system time.
            (default: None)

        Returns
        -------
        :py:class:`RandomForestModel`
            that can be used for prediction.

        Examples
        --------
        >>> from pyspark.mllib.regression import LabeledPoint
        >>> from pyspark.mllib.tree import RandomForest
        >>> from pyspark.mllib.linalg import SparseVector
        >>>
        >>> sparse_data = [
        ...     LabeledPoint(0.0, SparseVector(2, {0: 1.0})),
        ...     LabeledPoint(1.0, SparseVector(2, {1: 1.0})),
        ...     LabeledPoint(0.0, SparseVector(2, {0: 1.0})),
        ...     LabeledPoint(1.0, SparseVector(2, {1: 2.0}))
        ... ]
        >>>
        >>> model = RandomForest.trainRegressor(sc.parallelize(sparse_data), {}, 2, seed=42)
        >>> model.numTrees()
        2
        >>> model.totalNumNodes()
        4
        >>> model.predict(SparseVector(2, {1: 1.0}))
        1.0
        >>> model.predict(SparseVector(2, {0: 1.0}))
        0.5
        >>> rdd = sc.parallelize([[0.0, 1.0], [1.0, 0.0]])
        >>> model.predict(rdd).collect()
        [1.0, 0.5]
        """
        # numClasses is meaningless for regression; 0 is the sentinel the
        # JVM side expects.
        return cls._train(
            data,
            "regression",
            0,
            categoricalFeaturesInfo,
            numTrees,
            featureSubsetStrategy,
            impurity,
            maxDepth,
            maxBins,
            seed,
        )
@inherit_doc
class GradientBoostedTreesModel(TreeEnsembleModel, JavaLoader["GradientBoostedTreesModel"]):
    """
    Represents a gradient-boosted tree model.

    .. versionadded:: 1.3.0
    """

    @classmethod
    def _java_loader_class(cls) -> str:
        # Fully-qualified Scala class that JavaLoader uses to reconstruct
        # the JVM-side model when loading from disk.
        return "org.apache.spark.mllib.tree.model.GradientBoostedTreesModel"
class GradientBoostedTrees:
    """
    Learning algorithm for a gradient boosted trees model for
    classification or regression.

    .. versionadded:: 1.3.0
    """

    @classmethod
    def _train(
        cls,
        data: RDD[LabeledPoint],
        algo: str,
        categoricalFeaturesInfo: Dict[int, int],
        loss: str,
        numIterations: int,
        learningRate: float,
        maxDepth: int,
        maxBins: int,
    ) -> GradientBoostedTreesModel:
        # Cheap sanity check: inspect only the first element rather than
        # scanning the whole (potentially huge) RDD.
        first = data.first()
        assert isinstance(first, LabeledPoint), "the data should be RDD of LabeledPoint"
        # Argument order must match the Scala-side
        # trainGradientBoostedTreesModel signature exactly.
        model = callMLlibFunc(
            "trainGradientBoostedTreesModel",
            data,
            algo,
            categoricalFeaturesInfo,
            loss,
            numIterations,
            learningRate,
            maxDepth,
            maxBins,
        )
        return GradientBoostedTreesModel(model)

    @classmethod
    def trainClassifier(
        cls,
        data: RDD[LabeledPoint],
        categoricalFeaturesInfo: Dict[int, int],
        loss: str = "logLoss",
        numIterations: int = 100,
        learningRate: float = 0.1,
        maxDepth: int = 3,
        maxBins: int = 32,
    ) -> GradientBoostedTreesModel:
        """
        Train a gradient-boosted trees model for classification.

        .. versionadded:: 1.3.0

        Parameters
        ----------
        data : :py:class:`pyspark.RDD`
            Training dataset: RDD of LabeledPoint. Labels should take values
            {0, 1}.
        categoricalFeaturesInfo : dict
            Map storing arity of categorical features. An entry (n -> k)
            indicates that feature n is categorical with k categories
            indexed from 0: {0, 1, ..., k-1}.
        loss : str, optional
            Loss function used for minimization during gradient boosting.
            Supported values: "logLoss", "leastSquaresError",
            "leastAbsoluteError".
            (default: "logLoss")
        numIterations : int, optional
            Number of iterations of boosting.
            (default: 100)
        learningRate : float, optional
            Learning rate for shrinking the contribution of each estimator.
            The learning rate should be between in the interval (0, 1].
            (default: 0.1)
        maxDepth : int, optional
            Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1
            means 1 internal node + 2 leaf nodes).
            (default: 3)
        maxBins : int, optional
            Maximum number of bins used for splitting features. DecisionTree
            requires maxBins >= max categories.
            (default: 32)

        Returns
        -------
        :py:class:`GradientBoostedTreesModel`
            that can be used for prediction.

        Examples
        --------
        >>> from pyspark.mllib.regression import LabeledPoint
        >>> from pyspark.mllib.tree import GradientBoostedTrees
        >>>
        >>> data = [
        ...     LabeledPoint(0.0, [0.0]),
        ...     LabeledPoint(0.0, [1.0]),
        ...     LabeledPoint(1.0, [2.0]),
        ...     LabeledPoint(1.0, [3.0])
        ... ]
        >>>
        >>> model = GradientBoostedTrees.trainClassifier(sc.parallelize(data), {}, numIterations=10)
        >>> model.numTrees()
        10
        >>> model.totalNumNodes()
        30
        >>> print(model)  # it already has newline
        TreeEnsembleModel classifier with 10 trees
        <BLANKLINE>
        >>> model.predict([2.0])
        1.0
        >>> model.predict([0.0])
        0.0
        >>> rdd = sc.parallelize([[2.0], [0.0]])
        >>> model.predict(rdd).collect()
        [1.0, 0.0]
        """
        return cls._train(
            data,
            "classification",
            categoricalFeaturesInfo,
            loss,
            numIterations,
            learningRate,
            maxDepth,
            maxBins,
        )

    @classmethod
    def trainRegressor(
        cls,
        data: RDD[LabeledPoint],
        categoricalFeaturesInfo: Dict[int, int],
        loss: str = "leastSquaresError",
        numIterations: int = 100,
        learningRate: float = 0.1,
        maxDepth: int = 3,
        maxBins: int = 32,
    ) -> GradientBoostedTreesModel:
        """
        Train a gradient-boosted trees model for regression.

        .. versionadded:: 1.3.0

        Parameters
        ----------
        data : :py:class:`pyspark.RDD`
            Training dataset: RDD of LabeledPoint. Labels are real numbers.
        categoricalFeaturesInfo : dict
            Map storing arity of categorical features. An entry (n -> k)
            indicates that feature n is categorical with k categories
            indexed from 0: {0, 1, ..., k-1}.
        loss : str, optional
            Loss function used for minimization during gradient boosting.
            Supported values: "logLoss", "leastSquaresError",
            "leastAbsoluteError".
            (default: "leastSquaresError")
        numIterations : int, optional
            Number of iterations of boosting.
            (default: 100)
        learningRate : float, optional
            Learning rate for shrinking the contribution of each estimator.
            The learning rate should be between in the interval (0, 1].
            (default: 0.1)
        maxDepth : int, optional
            Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1
            means 1 internal node + 2 leaf nodes).
            (default: 3)
        maxBins : int, optional
            Maximum number of bins used for splitting features. DecisionTree
            requires maxBins >= max categories.
            (default: 32)

        Returns
        -------
        :py:class:`GradientBoostedTreesModel`
            that can be used for prediction.

        Examples
        --------
        >>> from pyspark.mllib.regression import LabeledPoint
        >>> from pyspark.mllib.tree import GradientBoostedTrees
        >>> from pyspark.mllib.linalg import SparseVector
        >>>
        >>> sparse_data = [
        ...     LabeledPoint(0.0, SparseVector(2, {0: 1.0})),
        ...     LabeledPoint(1.0, SparseVector(2, {1: 1.0})),
        ...     LabeledPoint(0.0, SparseVector(2, {0: 1.0})),
        ...     LabeledPoint(1.0, SparseVector(2, {1: 2.0}))
        ... ]
        >>>
        >>> data = sc.parallelize(sparse_data)
        >>> model = GradientBoostedTrees.trainRegressor(data, {}, numIterations=10)
        >>> model.numTrees()
        10
        >>> model.totalNumNodes()
        12
        >>> model.predict(SparseVector(2, {1: 1.0}))
        1.0
        >>> model.predict(SparseVector(2, {0: 1.0}))
        0.0
        >>> rdd = sc.parallelize([[0.0, 1.0], [1.0, 0.0]])
        >>> model.predict(rdd).collect()
        [1.0, 0.0]
        """
        return cls._train(
            data,
            "regression",
            categoricalFeaturesInfo,
            loss,
            numIterations,
            learningRate,
            maxDepth,
            maxBins,
        )