In [1]: import numpy as np
        import matplotlib.pyplot as plt
        %matplotlib inline

In [2]: from sklearn import datasets
        iris = datasets.load_iris()

In [3]: print(iris.DESCR)
Iris Plants Database
====================

Notes
-----
Data Set Characteristics:
    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
    :Summary Statistics:

    ============== ==== ==== ======= ===== ====================
                    Min  Max   Mean    SD   Class Correlation
    ============== ==== ==== ======= ===== ====================
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)
    ============== ==== ==== ======= ===== ====================

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :Date: July, 1988

This is a copy of UCI ML iris datasets.
http://archive.ics.uci.edu/ml/datasets/Iris

The famous Iris database, first used by Sir R.A. Fisher

This is perhaps the best known database to be found in the pattern
recognition literature.  Fisher's paper is a classic in the field and is
referenced frequently to this day.  (See Duda & Hart, for example.)  The
data set contains 3 classes of 50 instances each, where each class refers
to a type of iris plant.  One class is linearly separable from the other 2;
the latter are NOT linearly separable from each other.

References
----------
   - Fisher, R.A. "The use of multiple measurements in taxonomic problems"
     Annual Eugenics, 7, Part II, 179-188 (1936); also in "Contributions to
     Mathematical Statistics" (John Wiley, NY, 1950).
   - Duda, R.O., & Hart, P.E. (1973) Pattern Classification and Scene Analysis.
     (Q327.D83) John Wiley & Sons.  ISBN 0-471-22361-1.  See page 218.
   - Dasarathy, B.V. (1980) "Nosing Around the Neighborhood: A New System
     Structure and Classification Rule for Recognition in Partially Exposed
     Environments".  IEEE Transactions on Pattern Analysis and Machine
     Intelligence, Vol. PAMI-2, No. 1, 67-71.
   - Gates, G.W. (1972) "The Reduced Nearest Neighbor Rule".  IEEE Transactions
     on Information Theory, May 1972, 431-433.
   - See also: 1988 MLC Proceedings, 54-64.  Cheeseman et al.'s AUTOCLASS II
     conceptual clustering system finds 3 classes in the data.
   - Many, many more ...

In [4]: iris.data[:10, :]
Out[4]: array([[ 5.1,  3.5,  1.4,  0.2],
               [ 4.9,  3. ,  1.4,  0.2],
               [ 4.7,  3.2,  1.3,  0.2],
               [ 4.6,  3.1,  1.5,  0.2],
               [ 5. ,  3.6,  1.4,  0.2],
               [ 5.4,  3.9,  1.7,  0.4],
               [ 4.6,  3.4,  1.4,  0.3],
               [ 5. ,  3.4,  1.5,  0.2],
               [ 4.4,  2.9,  1.4,  0.2],
               [ 4.9,  3.1,  1.5,  0.1]])
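As a side note, the raw NumPy arrays can be easier to inspect as a table. The following is a minimal sketch (not part of the session above) that loads the same data into a pandas DataFrame, assuming pandas is installed:

import pandas as pd
from sklearn import datasets

iris = datasets.load_iris()
# One column per feature, plus the species name decoded from the integer label.
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df["species"] = iris.target_names[iris.target]
print(df.head())       # first five rows
print(df.describe())   # per-column summary statistics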
In [5]: iris.target
Out[5]: array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
               0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
               1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
               1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
               2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
               2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [6]: x = iris.data[:, 0]
        y = iris.data[:, 2]
        plt.scatter(x, y)
Out[6]: [scatter plot of sepal length (x) against petal length (y)]

In [7]: from sklearn import linear_model
        model = linear_model.LinearRegression()
        model.fit(iris.data[:, 0].reshape(-1, 1), iris.data[:, 2])
Out[7]: LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [8]: xmin = x.min()
        xmax = x.max()

        def f(t):
            return model.coef_[0] * t + model.intercept_

        plt.plot([xmin, xmax], [f(xmin), f(xmax)])
        plt.scatter(x, y)
Out[8]: [the same scatter plot with the fitted regression line drawn through it]

In [9]: model.predict([[6.5]])
Out[9]: array([ 4.97843135])

In [10]: training_data = iris.data[:, [1, 3]]
         target = iris.target
         for i, color in enumerate("rgb"):
             plt.scatter(training_data[target == i, 0],
                         training_data[target == i, 1], color=color)

In [11]: from sklearn import svm
         model = svm.SVC()
         model.fit(training_data, target)
Out[11]: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
             decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
             max_iter=-1, probability=False, random_state=None, shrinking=True,
             tol=0.001, verbose=False)

In [12]: xmin = training_data[:, 0].min()
         xmax = training_data[:, 0].max()
         ymin = training_data[:, 1].min()
         ymax = training_data[:, 1].max()
         xs = np.linspace(xmin - 0.1, xmax + 0.1, 300)
         ys = np.linspace(ymin - 0.1, ymax + 0.1, 300)
         xmesh, ymesh = np.meshgrid(xs, ys)
         plt.contourf(xmesh, ymesh,
                      model.predict(np.c_[xmesh.ravel(), ymesh.ravel()]).reshape(xmesh.shape),
                      levels=[-0.5, 0.5, 1.5, 2.5, 3.5], colors=["r", "g", "b"])
         for i, color in zip(range(3), "rgb"):
             plt.scatter(training_data[target == i, 0],
                         training_data[target == i, 1],
                         color=color, linewidth=1, edgecolor="k", marker="o")

In [13]: from sklearn import model_selection
         X_train, X_test, y_train, y_test = model_selection.train_test_split(
             iris.data, iris.target, test_size=0.2, random_state=0)
         X_train.shape, y_train.shape, X_test.shape, y_test.shape
Out[13]: ((120, 4), (120,), (30, 4), (30,))

In [14]: model = svm.SVC()
         model.fit(X_train, y_train)
Out[14]: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
             decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
             max_iter=-1, probability=False, random_state=None, shrinking=True,
             tol=0.001, verbose=False)

In [15]: model.score(X_test, y_test)
Out[15]: 1.0

In [16]: model = svm.SVC()
         scores = model_selection.cross_val_score(model, iris.data, iris.target, cv=5)
         scores, scores.mean()
Out[16]: (array([ 0.96666667,  1.        ,  0.96666667,  0.96666667,  1.        ]),
          0.98000000000000009)
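Both the single train/test split and the five-fold cross-validation above use SVC with its default parameters. A natural follow-up is to search over C and gamma with GridSearchCV; the sketch below illustrates this (the parameter grid is an arbitrary example, not taken from the session above):

from sklearn import datasets, svm, model_selection

iris = datasets.load_iris()
# Candidate values for the regularization strength C and the RBF kernel width gamma.
param_grid = {"C": [0.1, 1, 10, 100], "gamma": [0.01, 0.1, 1]}
search = model_selection.GridSearchCV(svm.SVC(), param_grid, cv=5)
search.fit(iris.data, iris.target)
print(search.best_params_)   # best parameter combination found
print(search.best_score_)    # its mean cross-validated accuracy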
In [17]: from sklearn import cluster
         training_data = iris.data[:, [1, 2]]
         model = cluster.KMeans(n_clusters=4)
         model.fit(training_data)
Out[17]: KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
                n_clusters=4, n_init=10, n_jobs=1, precompute_distances='auto',
                random_state=None, tol=0.0001, verbose=0)

In [18]: model.labels_
Out[18]: array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                2, 2, 2, 0, 2, 2, 2, 0, 2, 0, 0, 0, 0, 2, 0, 2, 2, 0, 0, 0, 2, 0, 2, 2, 0,
                2, 2, 2, 2, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0,
                3, 2, 3, 3, 3, 3, 2, 3, 3, 3, 2, 2, 3, 2, 2, 2, 3, 3, 3, 2, 3, 2, 3, 2, 3,
                3, 2, 2, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 2, 3, 3, 2, 2, 3, 3, 2, 2, 2, 3, 2],
               dtype=int32)

In [19]: for i, c in enumerate("rgbc"):
             data = training_data[model.labels_ == i, :]
             plt.scatter(data[:, 0], data[:, 1], color=c)

In [20]: from sklearn import decomposition
         model = decomposition.PCA(n_components=2)
         model.fit(iris.data)
Out[20]: PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
             svd_solver='auto', tol=0.0, whiten=False)

In [21]: compressed = model.transform(iris.data)

In [22]: for i, c in enumerate("rgb"):
             plt.scatter(compressed[iris.target == i, 0],
                         compressed[iris.target == i, 1], color=c)
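The two principal components separate setosa cleanly from the other two species, but the session does not report how much of the total variance they retain. The sketch below (an illustration, not part of the session above) checks explained_variance_ratio_ and, as one possible follow-up, chains PCA and SVC into a pipeline scored by cross-validation:

from sklearn import datasets, decomposition, svm, model_selection
from sklearn.pipeline import make_pipeline

iris = datasets.load_iris()
pca = decomposition.PCA(n_components=2)
pca.fit(iris.data)
# Fraction of the total variance captured by each of the two components.
print(pca.explained_variance_ratio_)

# Reduce to two dimensions, then classify, evaluated with 5-fold cross-validation.
pipeline = make_pipeline(decomposition.PCA(n_components=2), svm.SVC())
scores = model_selection.cross_val_score(pipeline, iris.data, iris.target, cv=5)
print(scores.mean())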