import numpy as np
# Plotting tools
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
# Import the tree model and the dataset loading utilities
from sklearn import tree, datasets
# Import the train/test split utility
from sklearn.model_selection import train_test_split
wine=datasets.load_wine()
# Use only the first 2 features of the dataset so the results are easy to plot
X=wine.data[:, :2]
y=wine.target
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=8)
# Set the decision tree classifier's maximum depth to 1
clf=tree.DecisionTreeClassifier(max_depth=1)
# Fit
clf.fit(X_train, y_train)
# Print the scores
print("training score: {:.3f}".format(clf.score(X_train, y_train)) )
print("testing score: {:.3f}".format(clf.score(X_test, y_test)) )
# The key parameter is max_depth: the number of yes/no questions the tree is allowed to ask.
# The more questions it asks, the deeper the tree.
training score: 0.692
testing score: 0.622
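To check literally how many questions such a tree asks, we can inspect the fitted model. A minimal sketch, assuming the clf fitted above is still in scope (tree_.feature / tree_.threshold are scikit-learn's internal per-node split arrays):

# A depth-1 tree asks a single yes/no question, so it has exactly 2 leaves
print("depth: ", clf.get_depth())      # 1
print("leaves:", clf.get_n_leaves())   # 2
# The question itself: which feature and threshold the root node splits on
print("root split: feature", clf.tree_.feature[0],
      "<= {:.3f}".format(clf.tree_.threshold[0]))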
# Visualization
# Define the colors for the decision regions and for the scatter points
cmap_light=ListedColormap(["#FFAAAA", "#AAFFAA", "#AAAAFF"])
cmap_bold=ListedColormap(["#FF0000", "#00FF00", "#0000FF"])
# Build the plotting grid (x/y axes) from the samples' 2 features
x_min, x_max= X_train[:, 0].min()-1, X_train[:, 0].max()+1
y_min, y_max= X_train[:, 1].min()-1, X_train[:, 1].max()+1
xx, yy=np.meshgrid(np.arange(x_min, x_max, 0.02),
np.arange(y_min, y_max, 0.02))
Z=clf.predict(np.c_[xx.ravel(), yy.ravel()])
# Assign a different color to each predicted class region
Z=Z.reshape(xx.shape)
plt.figure()
plt.pcolormesh(xx, yy, Z, cmap=cmap_light, shading='auto')
# Scatter plot of the samples
plt.scatter(X[:,0], X[:,1], c=y, cmap=cmap_bold, edgecolor="k", s=20)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title("Classifier: tree( max_depth = 1)")
plt.show()
With max_depth=1 the tree can only split the samples into 2 classes, so the classification is poor: accuracy is below 70%.
# Try a larger depth: 3
clf2=tree.DecisionTreeClassifier(max_depth=3)
# Fit
clf2.fit(X_train, y_train)
# Print the scores
print("training score: {:.3f}".format(clf2.score(X_train, y_train)) )
print("testing score: {:.3f}".format(clf2.score(X_test, y_test)) )
training score: 0.887
testing score: 0.822
# Visualization
Z=clf2.predict(np.c_[xx.ravel(), yy.ravel()])
# Assign a different color to each predicted class region
Z=Z.reshape(xx.shape)
plt.figure()
plt.pcolormesh(xx, yy, Z, cmap=cmap_light, shading='auto')
# Scatter plot of the samples
plt.scatter(X[:,0], X[:,1], c=y, cmap=cmap_bold, edgecolor="k", s=20)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title("Classifier: tree( max_depth = 3)")
plt.show()
# Try a larger depth: 5
clf3=tree.DecisionTreeClassifier(max_depth=5)
# Fit
clf3.fit(X_train, y_train)
# Print the scores
print("training score: {:.3f}".format(clf3.score(X_train, y_train)) )
print("testing score: {:.3f}".format(clf3.score(X_test, y_test)) )
# Signs of overfitting: the training score is much better than the testing score.
training score: 0.925
testing score: 0.778
# Visualization
Z=clf3.predict(np.c_[xx.ravel(), yy.ravel()])
# Assign a different color to each predicted class region
Z=Z.reshape(xx.shape)
plt.figure()
plt.pcolormesh(xx, yy, Z, cmap=cmap_light, shading='auto')
# Scatter plot of the samples
plt.scatter(X[:,0], X[:,1], c=y, cmap=cmap_bold, edgecolor="k", s=20)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title("Classifier: tree( max_depth = 5)")
plt.show()
# Install the graphviz package first (shell command):
pip3 install graphviz -i https://pypi.douban.com/simple/
import graphviz
from sklearn.tree import export_graphviz
# Visualize the max_depth=3 decision tree
# Write it to a file
export_graphviz(clf2, out_file="wine.dot", class_names=wine.target_names,
feature_names=wine.feature_names[:2], impurity=False, filled=True)
# Read the file
with open("wine.dot") as f:
    dot_graph = f.read()
# Visualize
graphviz.Source(dot_graph)
# This hierarchical structure makes it very easy to explain to non-experts how the algorithm works.
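If graphviz is not available, a similar picture can be drawn with scikit-learn's built-in plot_tree. A minimal sketch, assuming the fitted clf2 and the wine dataset from above are still in scope:

import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

plt.figure(figsize=(12, 6))
# plot_tree renders the fitted tree with matplotlib only, no graphviz required
plot_tree(clf2, feature_names=wine.feature_names[:2],
          class_names=wine.target_names, filled=True, impurity=False)
plt.show()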
def getScore(depth):
    # Build a decision tree with the given maximum depth
    clf = tree.DecisionTreeClassifier(max_depth=depth)
    # Fit
    clf.fit(X_train, y_train)
    # Return the training and testing scores
    return [clf.score(X_train, y_train), clf.score(X_test, y_test)]
scores=[]
for i in range(1, 10):
    scores.append(getScore(i))
scores=np.array(scores)
scores
array([[0.69172932, 0.62222222],
       [0.82706767, 0.77777778],
       [0.88721805, 0.82222222],
       [0.90977444, 0.77777778],
       [0.92481203, 0.77777778],
       [0.94736842, 0.73333333],
       [0.96992481, 0.75555556],
       [0.9924812 , 0.73333333],
       [1.        , 0.73333333]])
plt.plot(range(1, 10), scores[:,0], label="training score")
plt.plot(range(1, 10), scores[:,1], label="testing score")
plt.xlabel("max_depth")
plt.ylabel("Score")
plt.legend()
plt.show()
A random forest packages several different decision trees together; each tree is built from different (randomized) data and settings, and the forest averages the trees' predictions.
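To make the averaging concrete, here is a minimal sketch, assuming the wine X_train/X_test split from the decision-tree example above is still in scope; demo_forest is just an illustrative name:

import numpy as np
from sklearn.ensemble import RandomForestClassifier

demo_forest = RandomForestClassifier(n_estimators=6, random_state=3)
demo_forest.fit(X_train, y_train)

# Average the class probabilities predicted by the individual trees...
avg_proba = np.mean([t.predict_proba(X_test) for t in demo_forest.estimators_], axis=0)

# ...which is how the forest itself computes predict_proba
print(np.allclose(avg_proba, demo_forest.predict_proba(X_test)))  # True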
# Import the random forest classifier
from sklearn.ensemble import RandomForestClassifier
# Import the train/test split utility
from sklearn.model_selection import train_test_split
from sklearn import tree, datasets
wine=datasets.load_wine()
# Use only the first 2 features of the dataset so the results are easy to plot
X=wine.data[:, :2]
y=wine.target
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=8)
# Set up a random forest with 6 trees
forest=RandomForestClassifier(n_estimators=6, random_state=3)
# Fit
forest.fit(X_train, y_train)
# Print the scores
print("training score: {:.3f}".format(forest.score(X_train, y_train)) )
print("testing score: {:.3f}".format(forest.score(X_test, y_test)) )
# The testing score is clearly worse than the training score: the forest is overfitting.
training score: 0.977
testing score: 0.778
help(RandomForestClassifier)
# bootstrap=True is an important parameter, and it is the default.
# Each tree is trained on a random bootstrap sample, and each tree also considers different features, which keeps the trees different from one another.
# max_features is another important parameter; the default "auto" means sqrt(number of features). Too small and the trees differ too much; too large and the trees end up almost identical.
# n_estimators is the number of decision trees. The probability-weighted vote of these trees determines the random forest's output.
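One way to see this diversity directly is to look at the fitted sub-trees stored in estimators_. A short sketch, assuming the forest fitted above is still in scope:

# Each sub-tree is an ordinary DecisionTreeClassifier grown on its own bootstrap sample,
# so depths, leaf counts and individual test scores usually differ from tree to tree
for i, t in enumerate(forest.estimators_):
    print("tree {}: depth={}, leaves={}, test score={:.3f}".format(
        i, t.get_depth(), t.get_n_leaves(), t.score(X_test, y_test)))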
Help on class RandomForestClassifier in module sklearn.ensemble._forest:

class RandomForestClassifier(ForestClassifier)
 |  RandomForestClassifier(n_estimators=100, *, criterion='gini', max_depth=None,
 |      min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0,
 |      max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0,
 |      bootstrap=True, oob_score=False, n_jobs=None, random_state=None,
 |      verbose=0, warm_start=False, class_weight=None, ccp_alpha=0.0,
 |      max_samples=None)
 |
 |  A random forest classifier.
 |
 |  A random forest is a meta estimator that fits a number of decision tree
 |  classifiers on various sub-samples of the dataset and uses averaging to
 |  improve the predictive accuracy and control over-fitting.
 |  The sub-sample size is controlled with the `max_samples` parameter if
 |  `bootstrap=True` (default), otherwise the whole dataset is used to build
 |  each tree.
 |
 |  Parameters
 |  ----------
 |  n_estimators : int, default=100
 |      The number of trees in the forest.
 |
 |  max_features : {"auto", "sqrt", "log2"}, int or float, default="auto"
 |      The number of features to consider when looking for the best split.
 |      If "auto", then `max_features=sqrt(n_features)`.
 |
 |  bootstrap : bool, default=True
 |      Whether bootstrap samples are used when building trees. If False, the
 |      whole dataset is used to build each tree.
 |
 |  ...
## Visualizing the random forest
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
# Define the colors for the decision regions and for the scatter points
cmap_light=ListedColormap(["#FFAAAA", "#AAFFAA", "#AAAAFF"])
cmap_bold=ListedColormap(["#FF0000", "#00FF00", "#0000FF"])
# Build the plotting grid (x/y axes) from the samples' 2 features
x_min, x_max= X_train[:, 0].min()-1, X_train[:, 0].max()+1
y_min, y_max= X_train[:, 1].min()-1, X_train[:, 1].max()+1
xx, yy=np.meshgrid(np.arange(x_min, x_max, 0.02),
np.arange(y_min, y_max, 0.02))
Z=forest.predict(np.c_[xx.ravel(), yy.ravel()])
# Assign a different color to each predicted class region
Z=Z.reshape(xx.shape)
plt.figure()
plt.pcolormesh(xx, yy, Z, cmap=cmap_light, shading='auto')
# Scatter plot of the samples
plt.scatter(X[:,0], X[:,1], c=y, cmap=cmap_bold, edgecolor="k", s=20)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title("Classifier: RandomForestClassifier")
plt.show()
# The decision boundary is now much finer-grained.
# Try adjusting the n_estimators and random_state parameters and watch how the classifier's performance changes.
def getScore(n_est):
    forest = RandomForestClassifier(n_estimators=n_est, random_state=3)
    # Fit
    forest.fit(X_train, y_train)
    # Return the training and testing scores
    return [forest.score(X_train, y_train), forest.score(X_test, y_test)]
scores=[]
for i in range(1, 30):
    scores.append(getScore(i))
scores=np.array(scores)
# Plot
plt.plot(range(1, 30), scores[:,0], label="training score")
plt.plot(range(1, 30), scores[:,1], label="testing score")
plt.xlabel("n_estimators")
plt.ylabel("Score")
plt.legend()
plt.show()
# The testing score tops out around n_estimators=17
def getScore(seed):
    forest = RandomForestClassifier(n_estimators=17, random_state=seed)
    # Fit
    forest.fit(X_train, y_train)
    # Return the training and testing scores
    return [forest.score(X_train, y_train), forest.score(X_test, y_test)]
scores=[]
for i in range(1, 30):
    scores.append(getScore(i))
scores=np.array(scores)
# Plot
plt.plot(range(1, 30), scores[:,0], label="training score")
plt.plot(range(1, 30), scores[:,1], label="testing score")
plt.xlabel("random_state")
plt.ylabel("Score")
plt.legend()
plt.show()
# random_state is the random seed; it affects the bootstrap sampling, the feature sampling at each split, and so on.
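As a quick check of what fixing the seed buys us, this sketch (parameter values are just illustrative) fits the same forest twice with the same random_state and confirms the scores are reproducible:

# Two forests built with identical settings and the same seed are identical models
f1 = RandomForestClassifier(n_estimators=17, random_state=0).fit(X_train, y_train)
f2 = RandomForestClassifier(n_estimators=17, random_state=0).fit(X_train, y_train)
print(f1.score(X_test, y_test) == f2.score(X_test, y_test))  # True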
import os
os.getcwd()
'/data/wangjl/web/docs/jupyterlab'
# Download the csv files:
# https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data
# https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names
# Extract the column names:
# $ tail -n 14 adult.names | awk -F ":" '{print "\""$1"\""}'
import pandas as pd
data=pd.read_csv("data/adult.data", header=None, index_col=False,
names=["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", "occupation",
"relationship", "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"])
# For easier display, keep only a subset of the columns
data_lite=data[ ["age", "workclass","education","sex", "hours-per-week", "occupation", "income"] ]
print(data.shape)
print(data_lite.shape)
data_lite.head()
(32561, 15)
(32561, 7)
 | age | workclass | education | sex | hours-per-week | occupation | income
---|---|---|---|---|---|---|---
0 | 39 | State-gov | Bachelors | Male | 40 | Adm-clerical | <=50K |
1 | 50 | Self-emp-not-inc | Bachelors | Male | 13 | Exec-managerial | <=50K |
2 | 38 | Private | HS-grad | Male | 40 | Handlers-cleaners | <=50K |
3 | 53 | Private | 11th | Male | 40 | Handlers-cleaners | <=50K |
4 | 28 | Private | Bachelors | Female | 40 | Prof-specialty | <=50K |
import numpy as np
data_lite.loc[:,"workclass"].unique()
# '?' marks missing values (note the raw values carry a leading space, e.g. ' ?')
array([' State-gov', ' Self-emp-not-inc', ' Private', ' Federal-gov', ' Local-gov', ' ?', ' Self-emp-inc', ' Without-pay', ' Never-worked'], dtype=object)
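Before modeling, it can be useful to count or clean these ' ?' markers. A minimal sketch under that assumption:

# Count how many ' ?' placeholders each column contains
print((data_lite == ' ?').sum())

# Below we simply keep ' ?' as its own dummy category; an alternative is to
# convert it to NaN and drop (or impute) the affected rows
cleaned = data_lite.replace(' ?', np.nan).dropna()
print(data_lite.shape, cleaned.shape)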
# Use get_dummies to turn the categorical variables into 0/1 numeric (dummy) columns.
data_dummies=pd.get_dummies(data_lite)
print("样本原始特征:\n", list(data_lite.columns), "\n" )
print("虚拟变量特征:\n", list(data_dummies.columns) )
print( data_dummies.shape)
data_dummies.head()
# The original 7 columns have been expanded into 46 columns.
Original features:
 ['age', 'workclass', 'education', 'sex', 'hours-per-week', 'occupation', 'income']

Dummy-variable features:
 ['age', 'hours-per-week', 'workclass_ ?', 'workclass_ Federal-gov', 'workclass_ Local-gov', 'workclass_ Never-worked', 'workclass_ Private', 'workclass_ Self-emp-inc', 'workclass_ Self-emp-not-inc', 'workclass_ State-gov', 'workclass_ Without-pay', 'education_ 10th', 'education_ 11th', 'education_ 12th', 'education_ 1st-4th', 'education_ 5th-6th', 'education_ 7th-8th', 'education_ 9th', 'education_ Assoc-acdm', 'education_ Assoc-voc', 'education_ Bachelors', 'education_ Doctorate', 'education_ HS-grad', 'education_ Masters', 'education_ Preschool', 'education_ Prof-school', 'education_ Some-college', 'sex_ Female', 'sex_ Male', 'occupation_ ?', 'occupation_ Adm-clerical', 'occupation_ Armed-Forces', 'occupation_ Craft-repair', 'occupation_ Exec-managerial', 'occupation_ Farming-fishing', 'occupation_ Handlers-cleaners', 'occupation_ Machine-op-inspct', 'occupation_ Other-service', 'occupation_ Priv-house-serv', 'occupation_ Prof-specialty', 'occupation_ Protective-serv', 'occupation_ Sales', 'occupation_ Tech-support', 'occupation_ Transport-moving', 'income_ <=50K', 'income_ >50K']

(32561, 46)
 | age | hours-per-week | workclass_ ? | workclass_ Federal-gov | workclass_ Local-gov | workclass_ Never-worked | workclass_ Private | workclass_ Self-emp-inc | workclass_ Self-emp-not-inc | workclass_ State-gov | ... | occupation_ Machine-op-inspct | occupation_ Other-service | occupation_ Priv-house-serv | occupation_ Prof-specialty | occupation_ Protective-serv | occupation_ Sales | occupation_ Tech-support | occupation_ Transport-moving | income_ <=50K | income_ >50K
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 39 | 40 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
1 | 50 | 13 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
2 | 38 | 40 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
3 | 53 | 40 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
4 | 28 | 40 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
5 rows × 46 columns
## Assigning the data to X and y
features=data_dummies.loc[:, 'age':'occupation_ Transport-moving']
X=features.values
# Use "income greater than 50K" as the prediction target. Both income dummy columns were
# excluded from `features` above, otherwise the target would leak into X.
y=data_dummies["income_ >50K"].values
# Dimensions
print(X.shape, y.shape)
(32561, 44) (32561,)
# Import the train/test split utility
from sklearn.model_selection import train_test_split
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
############
from sklearn import tree
# Set the decision tree classifier's maximum depth to 5
clf=tree.DecisionTreeClassifier(max_depth=5)
# Fit
clf.fit(X_train, y_train)
# Print the scores
print("training score: {:.3f}".format(clf.score(X_train, y_train)) )
print("testing score: {:.3f}".format(clf.score(X_test, y_test)) )
print()
############
# Import the random forest classifier
from sklearn.ensemble import RandomForestClassifier
# Set up a random forest with 7 trees
rfc=RandomForestClassifier(n_estimators=7, random_state=3)
# Fit
rfc.fit(X_train, y_train)
# Print the scores
print("training score: {:.3f}".format(rfc.score(X_train, y_train)) )
print("testing score: {:.3f}".format(rfc.score(X_test, y_test)) )
training score: 0.803
testing score: 0.796

training score: 0.928
testing score: 0.784