import numpy as np
# Plotting tools
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
# Import the tree model and the dataset loading utilities
from sklearn import tree, datasets
# Import the train/test split utility
from sklearn.model_selection import train_test_split
wine=datasets.load_wine()
# Use only the first 2 features of the dataset so the results are easy to plot
X=wine.data[:, :2]
y=wine.target
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=8)
# Set the decision tree classifier's maximum depth to 1
clf=tree.DecisionTreeClassifier(max_depth=1)
# Fit
clf.fit(X_train, y_train)
# Print the scores
print("training score: {:.3f}".format(clf.score(X_train, y_train)) )
print("testing score: {:.3f}".format(clf.score(X_test, y_test)) )
# The key parameter is max_depth: the number of yes/no questions the tree is allowed to ask.
# The more questions it asks, the deeper the tree.
training score: 0.692
testing score: 0.622
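To check literally how many questions such a tree asks, we can inspect the fitted model. A minimal sketch, assuming the clf fitted above is still in scope (tree_.feature / tree_.threshold are scikit-learn's internal per-node split arrays):

# A depth-1 tree asks a single yes/no question, so it has exactly 2 leaves
print("depth: ", clf.get_depth())      # 1
print("leaves:", clf.get_n_leaves())   # 2
# The question itself: which feature and threshold the root node splits on
print("root split: feature", clf.tree_.feature[0],
      "<= {:.3f}".format(clf.tree_.threshold[0]))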
# Visualization
# Define the colors for the decision regions and for the scatter points
cmap_light=ListedColormap(["#FFAAAA", "#AAFFAA", "#AAAAFF"])
cmap_bold=ListedColormap(["#FF0000", "#00FF00", "#0000FF"])
# Build the plotting grid (x/y axes) from the samples' 2 features
x_min, x_max= X_train[:, 0].min()-1, X_train[:, 0].max()+1
y_min, y_max= X_train[:, 1].min()-1, X_train[:, 1].max()+1
xx, yy=np.meshgrid(np.arange(x_min, x_max, 0.02),
np.arange(y_min, y_max, 0.02))
Z=clf.predict(np.c_[xx.ravel(), yy.ravel()])
# Assign a different color to each predicted class region
Z=Z.reshape(xx.shape)
plt.figure()
plt.pcolormesh(xx, yy, Z, cmap=cmap_light, shading='auto')
# Scatter plot of the samples
plt.scatter(X[:,0], X[:,1], c=y, cmap=cmap_bold, edgecolor="k", s=20)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title("Classifier: tree( max_depth = 1)")
plt.show()
With max_depth=1 the tree can only split the samples into 2 classes, so the classification is poor: accuracy is below 70%.
# Try a larger depth: 3
clf2=tree.DecisionTreeClassifier(max_depth=3)
# Fit
clf2.fit(X_train, y_train)
# Print the scores
print("training score: {:.3f}".format(clf2.score(X_train, y_train)) )
print("testing score: {:.3f}".format(clf2.score(X_test, y_test)) )
training score: 0.887
testing score: 0.822
# Visualization
Z=clf2.predict(np.c_[xx.ravel(), yy.ravel()])
# Assign a different color to each predicted class region
Z=Z.reshape(xx.shape)
plt.figure()
plt.pcolormesh(xx, yy, Z, cmap=cmap_light, shading='auto')
# Scatter plot of the samples
plt.scatter(X[:,0], X[:,1], c=y, cmap=cmap_bold, edgecolor="k", s=20)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title("Classifier: tree( max_depth = 3)")
plt.show()
# Try a larger depth: 5
clf3=tree.DecisionTreeClassifier(max_depth=5)
# Fit
clf3.fit(X_train, y_train)
# Print the scores
print("training score: {:.3f}".format(clf3.score(X_train, y_train)) )
print("testing score: {:.3f}".format(clf3.score(X_test, y_test)) )
# Signs of overfitting: the training score is much better than the testing score.
training score: 0.925
testing score: 0.778
# Visualization
Z=clf3.predict(np.c_[xx.ravel(), yy.ravel()])
# Assign a different color to each predicted class region
Z=Z.reshape(xx.shape)
plt.figure()
plt.pcolormesh(xx, yy, Z, cmap=cmap_light, shading='auto')
# Scatter plot of the samples
plt.scatter(X[:,0], X[:,1], c=y, cmap=cmap_bold, edgecolor="k", s=20)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title("Classifier: tree( max_depth = 5)")
plt.show()
# Install the graphviz package first (shell command):
pip3 install graphviz -i https://pypi.douban.com/simple/
import graphviz
from sklearn.tree import export_graphviz
# Visualize the max_depth=3 decision tree
# Write it to a file
export_graphviz(clf2, out_file="wine.dot", class_names=wine.target_names,
feature_names=wine.feature_names[:2], impurity=False, filled=True)
# Read the file
with open("wine.dot") as f:
    dot_graph = f.read()
# Visualize
graphviz.Source(dot_graph)
# This hierarchical structure makes it very easy to explain to non-experts how the algorithm works.
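If graphviz is not available, a similar picture can be drawn with scikit-learn's built-in plot_tree. A minimal sketch, assuming the fitted clf2 and the wine dataset from above are still in scope:

import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

plt.figure(figsize=(12, 6))
# plot_tree renders the fitted tree with matplotlib only, no graphviz required
plot_tree(clf2, feature_names=wine.feature_names[:2],
          class_names=wine.target_names, filled=True, impurity=False)
plt.show()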
def getScore(depth):
    # Build a decision tree with the given maximum depth
    clf = tree.DecisionTreeClassifier(max_depth=depth)
    # Fit
    clf.fit(X_train, y_train)
    # Return the training and testing scores
    return [clf.score(X_train, y_train), clf.score(X_test, y_test)]
scores=[]
for i in range(1, 10):
    scores.append(getScore(i))
scores=np.array(scores)
scores
array([[0.69172932, 0.62222222],
       [0.82706767, 0.77777778],
       [0.88721805, 0.82222222],
       [0.90977444, 0.77777778],
       [0.92481203, 0.77777778],
       [0.94736842, 0.73333333],
       [0.96992481, 0.75555556],
       [0.9924812 , 0.73333333],
       [1.        , 0.73333333]])
plt.plot(range(1, 10), scores[:,0], label="training score")
plt.plot(range(1, 10), scores[:,1], label="testing score")
plt.xlabel("max_depth")
plt.ylabel("Score")
plt.legend()
plt.show()
A random forest packages several different decision trees together; each tree is built from different (randomized) data and settings, and the forest averages the trees' predictions.
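To make the averaging concrete, here is a minimal sketch, assuming the wine X_train/X_test split from the decision-tree example above is still in scope; demo_forest is just an illustrative name:

import numpy as np
from sklearn.ensemble import RandomForestClassifier

demo_forest = RandomForestClassifier(n_estimators=6, random_state=3)
demo_forest.fit(X_train, y_train)

# Average the class probabilities predicted by the individual trees...
avg_proba = np.mean([t.predict_proba(X_test) for t in demo_forest.estimators_], axis=0)

# ...which is how the forest itself computes predict_proba
print(np.allclose(avg_proba, demo_forest.predict_proba(X_test)))  # True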
# Import the random forest classifier
from sklearn.ensemble import RandomForestClassifier
# Import the train/test split utility
from sklearn.model_selection import train_test_split
from sklearn import tree, datasets
wine=datasets.load_wine()
# Use only the first 2 features of the dataset so the results are easy to plot
X=wine.data[:, :2]
y=wine.target
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=8)
# Set up a random forest with 6 trees
forest=RandomForestClassifier(n_estimators=6, random_state=3)
# Fit
forest.fit(X_train, y_train)
# Print the scores
print("training score: {:.3f}".format(forest.score(X_train, y_train)) )
print("testing score: {:.3f}".format(forest.score(X_test, y_test)) )
# The testing score is clearly worse than the training score: the forest is overfitting.
training score: 0.977
testing score: 0.778
help(RandomForestClassifier)
# bootstrap=True is an important parameter, and it is the default.
# Each tree is trained on a random bootstrap sample, and each tree also considers different features, which keeps the trees different from one another.
# max_features is another important parameter; the default "auto" means sqrt(number of features). Too small and the trees differ too much; too large and the trees end up almost identical.
# n_estimators is the number of decision trees. The probability-weighted vote of these trees determines the random forest's output.
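One way to see this diversity directly is to look at the fitted sub-trees stored in estimators_. A short sketch, assuming the forest fitted above is still in scope:

# Each sub-tree is an ordinary DecisionTreeClassifier grown on its own bootstrap sample,
# so depths, leaf counts and individual test scores usually differ from tree to tree
for i, t in enumerate(forest.estimators_):
    print("tree {}: depth={}, leaves={}, test score={:.3f}".format(
        i, t.get_depth(), t.get_n_leaves(), t.score(X_test, y_test)))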
Help on class RandomForestClassifier in module sklearn.ensemble._forest:

class RandomForestClassifier(ForestClassifier)
 |  RandomForestClassifier(n_estimators=100, *, criterion='gini', max_depth=None,
 |      min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0,
 |      max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0,
 |      bootstrap=True, oob_score=False, n_jobs=None, random_state=None,
 |      verbose=0, warm_start=False, class_weight=None, ccp_alpha=0.0,
 |      max_samples=None)
 |
 |  A random forest classifier.
 |
 |  A random forest is a meta estimator that fits a number of decision tree
 |  classifiers on various sub-samples of the dataset and uses averaging to
 |  improve the predictive accuracy and control over-fitting.
 |  The sub-sample size is controlled with the `max_samples` parameter if
 |  `bootstrap=True` (default), otherwise the whole dataset is used to build
 |  each tree.
 |
 |  Parameters
 |  ----------
 |  n_estimators : int, default=100
 |      The number of trees in the forest.
 |
 |  max_features : {"auto", "sqrt", "log2"}, int or float, default="auto"
 |      The number of features to consider when looking for the best split.
 |      If "auto", then `max_features=sqrt(n_features)`.
 |
 |  bootstrap : bool, default=True
 |      Whether bootstrap samples are used when building trees. If False, the
 |      whole dataset is used to build each tree.
 |
 |  ...
## Visualizing the random forest
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
# Define the colors for the decision regions and for the scatter points
cmap_light=ListedColormap(["#FFAAAA", "#AAFFAA", "#AAAAFF"])
cmap_bold=ListedColormap(["#FF0000", "#00FF00", "#0000FF"])
# Build the plotting grid (x/y axes) from the samples' 2 features
x_min, x_max= X_train[:, 0].min()-1, X_train[:, 0].max()+1
y_min, y_max= X_train[:, 1].min()-1, X_train[:, 1].max()+1
xx, yy=np.meshgrid(np.arange(x_min, x_max, 0.02),
np.arange(y_min, y_max, 0.02))
Z=forest.predict(np.c_[xx.ravel(), yy.ravel()])
# Assign a different color to each predicted class region
Z=Z.reshape(xx.shape)
plt.figure()
plt.pcolormesh(xx, yy, Z, cmap=cmap_light, shading='auto')
# Scatter plot of the samples
plt.scatter(X[:,0], X[:,1], c=y, cmap=cmap_bold, edgecolor="k", s=20)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title("Classifier: RandomForestClassifier")
plt.show()
# The decision boundary is now much finer-grained.
# Try adjusting the n_estimators and random_state parameters and watch how the classifier's performance changes.
def getScore(n_est):
    forest = RandomForestClassifier(n_estimators=n_est, random_state=3)
    # Fit
    forest.fit(X_train, y_train)
    # Return the training and testing scores
    return [forest.score(X_train, y_train), forest.score(X_test, y_test)]
scores=[]
for i in range(1, 30):
    scores.append(getScore(i))
scores=np.array(scores)
# Plot
plt.plot(range(1, 30), scores[:,0], label="training score")
plt.plot(range(1, 30), scores[:,1], label="testing score")
plt.xlabel("n_estimators")
plt.ylabel("Score")
plt.legend()
plt.show()
# The testing score tops out around n_estimators=17
def getScore(seed):
    forest = RandomForestClassifier(n_estimators=17, random_state=seed)
    # Fit
    forest.fit(X_train, y_train)
    # Return the training and testing scores
    return [forest.score(X_train, y_train), forest.score(X_test, y_test)]
scores=[]
for i in range(1, 30):
    scores.append(getScore(i))
scores=np.array(scores)
# Plot
plt.plot(range(1, 30), scores[:,0], label="training score")
plt.plot(range(1, 30), scores[:,1], label="testing score")
plt.xlabel("random_state")
plt.ylabel("Score")
plt.legend()
plt.show()
# random_state is the random seed; it affects the bootstrap sampling, the feature sampling at each split, and so on.
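As a quick check of what fixing the seed buys us, this sketch (parameter values are just illustrative) fits the same forest twice with the same random_state and confirms the scores are reproducible:

# Two forests built with identical settings and the same seed are identical models
f1 = RandomForestClassifier(n_estimators=17, random_state=0).fit(X_train, y_train)
f2 = RandomForestClassifier(n_estimators=17, random_state=0).fit(X_train, y_train)
print(f1.score(X_test, y_test) == f2.score(X_test, y_test))  # True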
import os
os.getcwd()
'/data/wangjl/web/docs/jupyterlab'
# Download the csv files:
# https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data
# https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names
# Extract the column names:
# $ tail -n 14 adult.names | awk -F ":" '{print "\""$1"\""}'
import pandas as pd
data=pd.read_csv("data/adult.data", header=None, index_col=False,
names=["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", "occupation",
"relationship", "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"])
# For easier display, keep only a subset of the columns
data_lite=data[ ["age", "workclass","education","sex", "hours-per-week", "occupation", "income"] ]
print(data.shape)
print(data_lite.shape)
data_lite.head()
(32561, 15)
(32561, 7)
 | age | workclass | education | sex | hours-per-week | occupation | income
---|---|---|---|---|---|---|---
0 | 39 | State-gov | Bachelors | Male | 40 | Adm-clerical | <=50K |
1 | 50 | Self-emp-not-inc | Bachelors | Male | 13 | Exec-managerial | <=50K |
2 | 38 | Private | HS-grad | Male | 40 | Handlers-cleaners | <=50K |
3 | 53 | Private | 11th | Male | 40 | Handlers-cleaners | <=50K |
4 | 28 | Private | Bachelors | Female | 40 | Prof-specialty | <=50K |
import numpy as np
data_lite.loc[:,"workclass"].unique()
# '?' marks missing values (note the raw values carry a leading space, e.g. ' ?')
array([' State-gov', ' Self-emp-not-inc', ' Private', ' Federal-gov', ' Local-gov', ' ?', ' Self-emp-inc', ' Without-pay', ' Never-worked'], dtype=object)
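Before modeling, it can be useful to count or clean these ' ?' markers. A minimal sketch under that assumption:

# Count how many ' ?' placeholders each column contains
print((data_lite == ' ?').sum())

# Below we simply keep ' ?' as its own dummy category; an alternative is to
# convert it to NaN and drop (or impute) the affected rows
cleaned = data_lite.replace(' ?', np.nan).dropna()
print(data_lite.shape, cleaned.shape)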
# Use get_dummies to turn the categorical variables into 0/1 numeric (dummy) columns.
data_dummies=pd.get_dummies(data_lite)
print("样本原始特征:\n", list(data_lite.columns), "\n" )
print("虚拟变量特征:\n", list(data_dummies.columns) )
print( data_dummies.shape)
data_dummies.head()
# The original 7 columns have been expanded into 46 columns.
Original features:
 ['age', 'workclass', 'education', 'sex', 'hours-per-week', 'occupation', 'income']

Dummy-variable features:
 ['age', 'hours-per-week', 'workclass_ ?', 'workclass_ Federal-gov', 'workclass_ Local-gov', 'workclass_ Never-worked', 'workclass_ Private', 'workclass_ Self-emp-inc', 'workclass_ Self-emp-not-inc', 'workclass_ State-gov', 'workclass_ Without-pay', 'education_ 10th', 'education_ 11th', 'education_ 12th', 'education_ 1st-4th', 'education_ 5th-6th', 'education_ 7th-8th', 'education_ 9th', 'education_ Assoc-acdm', 'education_ Assoc-voc', 'education_ Bachelors', 'education_ Doctorate', 'education_ HS-grad', 'education_ Masters', 'education_ Preschool', 'education_ Prof-school', 'education_ Some-college', 'sex_ Female', 'sex_ Male', 'occupation_ ?', 'occupation_ Adm-clerical', 'occupation_ Armed-Forces', 'occupation_ Craft-repair', 'occupation_ Exec-managerial', 'occupation_ Farming-fishing', 'occupation_ Handlers-cleaners', 'occupation_ Machine-op-inspct', 'occupation_ Other-service', 'occupation_ Priv-house-serv', 'occupation_ Prof-specialty', 'occupation_ Protective-serv', 'occupation_ Sales', 'occupation_ Tech-support', 'occupation_ Transport-moving', 'income_ <=50K', 'income_ >50K']

(32561, 46)
 | age | hours-per-week | workclass_ ? | workclass_ Federal-gov | workclass_ Local-gov | workclass_ Never-worked | workclass_ Private | workclass_ Self-emp-inc | workclass_ Self-emp-not-inc | workclass_ State-gov | ... | occupation_ Machine-op-inspct | occupation_ Other-service | occupation_ Priv-house-serv | occupation_ Prof-specialty | occupation_ Protective-serv | occupation_ Sales | occupation_ Tech-support | occupation_ Transport-moving | income_ <=50K | income_ >50K
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 39 | 40 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
1 | 50 | 13 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
2 | 38 | 40 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
3 | 53 | 40 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
4 | 28 | 40 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
5 rows × 46 columns
## Assigning the data to X and y
features=data_dummies.loc[:, 'age':'occupation_ Transport-moving']
X=features.values
# Use "income greater than 50K" as the prediction target. Both income dummy columns were
# excluded from `features` above, otherwise the target would leak into X.
y=data_dummies["income_ >50K"].values
# Dimensions
print(X.shape, y.shape)
(32561, 44) (32561,)
# Import the train/test split utility
from sklearn.model_selection import train_test_split
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
############
from sklearn import tree
# Set the decision tree classifier's maximum depth to 5
clf=tree.DecisionTreeClassifier(max_depth=5)
# Fit
clf.fit(X_train, y_train)
# Print the scores
print("training score: {:.3f}".format(clf.score(X_train, y_train)) )
print("testing score: {:.3f}".format(clf.score(X_test, y_test)) )
print()
############
# Import the random forest classifier
from sklearn.ensemble import RandomForestClassifier
# Set up a random forest with 7 trees
rfc=RandomForestClassifier(n_estimators=7, random_state=3)
# Fit
rfc.fit(X_train, y_train)
# Print the scores
print("training score: {:.3f}".format(rfc.score(X_train, y_train)) )
print("testing score: {:.3f}".format(rfc.score(X_test, y_test)) )
training score: 0.803
testing score: 0.796

training score: 0.928
testing score: 0.784