In [1]: import numpy as np
        import matplotlib.pyplot as plt
        %matplotlib inline

In [2]: from sklearn import datasets
        iris = datasets.load_iris()

In [3]: print(iris.DESCR)
Iris Plants Database
====================

Notes
-----
Data Set Characteristics:
    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
    :Summary Statistics:

    ============== ==== ==== ======= ===== ====================
                    Min  Max   Mean    SD   Class Correlation
    ============== ==== ==== ======= ===== ====================
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)
    ============== ==== ==== ======= ===== ====================

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :Date: July, 1988

This is a copy of UCI ML iris datasets.
http://archive.ics.uci.edu/ml/datasets/Iris

The famous Iris database, first used by Sir R.A. Fisher

This is perhaps the best known database to be found in the pattern
recognition literature.  Fisher's paper is a classic in the field and is
referenced frequently to this day.  (See Duda & Hart, for example.)  The
data set contains 3 classes of 50 instances each, where each class refers
to a type of iris plant.  One class is linearly separable from the other 2;
the latter are NOT linearly separable from each other.

References
----------
   - Fisher, R.A. "The use of multiple measurements in taxonomic problems"
     Annual Eugenics, 7, Part II, 179-188 (1936); also in "Contributions to
     Mathematical Statistics" (John Wiley, NY, 1950).
   - Duda, R.O., & Hart, P.E. (1973) Pattern Classification and Scene Analysis.
     (Q327.D83) John Wiley & Sons.  ISBN 0-471-22361-1.  See page 218.
   - Dasarathy, B.V. (1980) "Nosing Around the Neighborhood: A New System
     Structure and Classification Rule for Recognition in Partially Exposed
     Environments".  IEEE Transactions on Pattern Analysis and Machine
     Intelligence, Vol. PAMI-2, No. 1, 67-71.
   - Gates, G.W. (1972) "The Reduced Nearest Neighbor Rule".  IEEE Transactions
     on Information Theory, May 1972, 431-433.
   - See also: 1988 MLC Proceedings, 54-64.  Cheeseman et al.'s AUTOCLASS II
     conceptual clustering system finds 3 classes in the data.
   - Many, many more ...

In [4]: iris.data[:10, :]
Out[4]: array([[ 5.1,  3.5,  1.4,  0.2],
               [ 4.9,  3. ,  1.4,  0.2],
               [ 4.7,  3.2,  1.3,  0.2],
               [ 4.6,  3.1,  1.5,  0.2],
               [ 5. ,  3.6,  1.4,  0.2],
               [ 5.4,  3.9,  1.7,  0.4],
               [ 4.6,  3.4,  1.4,  0.3],
               [ 5. ,  3.4,  1.5,  0.2],
               [ 4.4,  2.9,  1.4,  0.2],
               [ 4.9,  3.1,  1.5,  0.1]])
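As a side note, the raw NumPy arrays can be easier to inspect as a table. The following is a minimal sketch (not part of the session above) that loads the same data into a pandas DataFrame, assuming pandas is installed:

import pandas as pd
from sklearn import datasets

iris = datasets.load_iris()
# One column per feature, plus the species name decoded from the integer label.
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df["species"] = iris.target_names[iris.target]
print(df.head())       # first five rows
print(df.describe())   # per-column summary statistics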
In [5]: iris.target
Out[5]: array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
               0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
               1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
               1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
               2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
               2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [6]: x = iris.data[:, 0]
        y = iris.data[:, 2]
        plt.scatter(x, y)
Out[6]: [scatter plot of sepal length (x) against petal length (y)]

In [7]: from sklearn import linear_model
        model = linear_model.LinearRegression()
        model.fit(iris.data[:, 0].reshape(-1, 1), iris.data[:, 2])
Out[7]: LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [8]: xmin = x.min()
        xmax = x.max()

        def f(t):
            return model.coef_[0] * t + model.intercept_

        plt.plot([xmin, xmax], [f(xmin), f(xmax)])
        plt.scatter(x, y)
Out[8]: [the same scatter plot with the fitted regression line drawn through it]

In [9]: model.predict([[6.5]])
Out[9]: array([ 4.97843135])

In [10]: training_data = iris.data[:, [1, 3]]
         target = iris.target
         for i, color in enumerate("rgb"):
             plt.scatter(training_data[target == i, 0],
                         training_data[target == i, 1], color=color)

In [11]: from sklearn import svm
         model = svm.SVC()
         model.fit(training_data, target)
Out[11]: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
             decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
             max_iter=-1, probability=False, random_state=None, shrinking=True,
             tol=0.001, verbose=False)

In [12]: xmin = training_data[:, 0].min()
         xmax = training_data[:, 0].max()
         ymin = training_data[:, 1].min()
         ymax = training_data[:, 1].max()
         xs = np.linspace(xmin - 0.1, xmax + 0.1, 300)
         ys = np.linspace(ymin - 0.1, ymax + 0.1, 300)
         xmesh, ymesh = np.meshgrid(xs, ys)
         plt.contourf(xmesh, ymesh,
                      model.predict(np.c_[xmesh.ravel(), ymesh.ravel()]).reshape(xmesh.shape),
                      levels=[-0.5, 0.5, 1.5, 2.5, 3.5], colors=["r", "g", "b"])
         for i, color in zip(range(3), "rgb"):
             plt.scatter(training_data[target == i, 0],
                         training_data[target == i, 1],
                         color=color, linewidth=1, edgecolor="k", marker="o")

In [13]: from sklearn import model_selection
         X_train, X_test, y_train, y_test = model_selection.train_test_split(
             iris.data, iris.target, test_size=0.2, random_state=0)
         X_train.shape, y_train.shape, X_test.shape, y_test.shape
Out[13]: ((120, 4), (120,), (30, 4), (30,))

In [14]: model = svm.SVC()
         model.fit(X_train, y_train)
Out[14]: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
             decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
             max_iter=-1, probability=False, random_state=None, shrinking=True,
             tol=0.001, verbose=False)

In [15]: model.score(X_test, y_test)
Out[15]: 1.0

In [16]: model = svm.SVC()
         scores = model_selection.cross_val_score(model, iris.data, iris.target, cv=5)
         scores, scores.mean()
Out[16]: (array([ 0.96666667,  1.        ,  0.96666667,  0.96666667,  1.        ]),
          0.98000000000000009)
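Both the single train/test split and the five-fold cross-validation above use SVC with its default parameters. A natural follow-up is to search over C and gamma with GridSearchCV; the sketch below illustrates this (the parameter grid is an arbitrary example, not taken from the session above):

from sklearn import datasets, svm, model_selection

iris = datasets.load_iris()
# Candidate values for the regularization strength C and the RBF kernel width gamma.
param_grid = {"C": [0.1, 1, 10, 100], "gamma": [0.01, 0.1, 1]}
search = model_selection.GridSearchCV(svm.SVC(), param_grid, cv=5)
search.fit(iris.data, iris.target)
print(search.best_params_)   # best parameter combination found
print(search.best_score_)    # its mean cross-validated accuracy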
In [17]: from sklearn import cluster
         training_data = iris.data[:, [1, 2]]
         model = cluster.KMeans(n_clusters=4)
         model.fit(training_data)
Out[17]: KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
                n_clusters=4, n_init=10, n_jobs=1, precompute_distances='auto',
                random_state=None, tol=0.0001, verbose=0)

In [18]: model.labels_
Out[18]: array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                2, 2, 2, 0, 2, 2, 2, 0, 2, 0, 0, 0, 0, 2, 0, 2, 2, 0, 0, 0, 2, 0, 2, 2, 0,
                2, 2, 2, 2, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0,
                3, 2, 3, 3, 3, 3, 2, 3, 3, 3, 2, 2, 3, 2, 2, 2, 3, 3, 3, 2, 3, 2, 3, 2, 3,
                3, 2, 2, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 2, 3, 3, 2, 2, 3, 3, 2, 2, 2, 3, 2],
               dtype=int32)

In [19]: for i, c in enumerate("rgbc"):
             data = training_data[model.labels_ == i, :]
             plt.scatter(data[:, 0], data[:, 1], color=c)

In [20]: from sklearn import decomposition
         model = decomposition.PCA(n_components=2)
         model.fit(iris.data)
Out[20]: PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
             svd_solver='auto', tol=0.0, whiten=False)

In [21]: compressed = model.transform(iris.data)

In [22]: for i, c in enumerate("rgb"):
             plt.scatter(compressed[iris.target == i, 0],
                         compressed[iris.target == i, 1], color=c)
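The two principal components separate setosa cleanly from the other two species, but the session does not report how much of the total variance they retain. The sketch below (an illustration, not part of the session above) checks explained_variance_ratio_ and, as one possible follow-up, chains PCA and SVC into a pipeline scored by cross-validation:

from sklearn import datasets, decomposition, svm, model_selection
from sklearn.pipeline import make_pipeline

iris = datasets.load_iris()
pca = decomposition.PCA(n_components=2)
pca.fit(iris.data)
# Fraction of the total variance captured by each of the two components.
print(pca.explained_variance_ratio_)

# Reduce to two dimensions, then classify, evaluated with 5-fold cross-validation.
pipeline = make_pipeline(decomposition.PCA(n_components=2), svm.SVC())
scores = model_selection.cross_val_score(pipeline, iris.data, iris.target, cv=5)
print(scores.mean())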