diff --git a/ml/clustering/clustering-manual-similarity.ipynb b/ml/clustering/clustering-manual-similarity.ipynb index f3ac805..75a3a6b 100644 --- a/ml/clustering/clustering-manual-similarity.ipynb +++ b/ml/clustering/clustering-manual-similarity.ipynb @@ -1,1217 +1,27 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "name": "Colab - Manual Similarity with Chocolates", - "version": "0.3.2", - "provenance": [], - "collapsed_sections": [ - "9EjQt_o9Xf_L", - "tj67XUzmjuNK" - ] - } - }, "cells": [ { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "9EjQt_o9Xf_L" - }, - "source": [ - "#### Copyright 2018 Google LLC." - ] - }, - { - "cell_type": "code", - "metadata": { - "cellView": "both", - "colab_type": "code", - "id": "oXzTW-CnXf_Q", - "colab": {} - }, - "source": [ - "#@title\n", - "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "# you may not use this file except in compliance with the License.\n", - "# You may obtain a copy of the License at\n", - "#\n", - "# https://www.apache.org/licenses/LICENSE-2.0\n", - "#\n", - "# Unless required by applicable law or agreed to in writing, software\n", - "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", - "# See the License for the specific language governing permissions and\n", - "# limitations under the License." - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "9NkysjxvKAli" - }, - "source": [ - "# Clustering with Manual Similarity Measure\n", - "\n", - "In this Colab, you will group chocolates in the\n", - "[Chocolate Bar Ratings](https://www.kaggle.com/rtatman/chocolate-bar-ratings)\n", - "dataset using the k-means clustering algorithm with a manual similarity measure. The dataset has ratings\n", - "of chocolate bars along with their cocoa percentage, bean type, bean origin,\n", - "maker name, and maker country. You will:\n", - "\n", - "* Load and clean the data.\n", - "* Process the data.\n", - "* Calculate similarity between pairs of chocolates.\n", - "* Cluster the chocolates using k-means.\n", - "* Check the clustering result using quality metrics.\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "2X92CHu-KDOi" - }, - "source": [ - "# 1. Load and clean data\n", - "\n", - "Run the following cell to load and clean the chocolate dataset. You do not need to\n", - "understand the code. The first few rows of the dataset are displayed. Inspect\n", - "the features and their values." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "cellView": "form", - "colab_type": "code", - "id": "Sq-yxIzRO4R2", - "colab": {} - }, - "source": [ - "#@title Run cell to load and clean the dataset\n", - "%reset -f\n", - "import math\n", - "\n", - "from matplotlib import pyplot as plt\n", - "import numpy as np\n", - "import numpy.linalg as nla\n", - "import pandas as pd\n", - "import seaborn as sns\n", - "import altair as alt\n", - "import re\n", - "import pdb # for Python debugger\n", - "import sys\n", - "from os.path import join\n", - "\n", - "# Set the output display to have one digit for decimal places and limit it to\n", - "# printing 15 rows.\n", - "np.set_printoptions(precision=2)\n", - "pd.options.display.float_format = '{:.2f}'.format\n", - "pd.options.display.max_rows = 15\n", - "\n", - "choc_data = pd.read_csv(\"https://download.mlcc.google.com/mledu-datasets/flavors_of_cacao.csv\", sep=\",\", encoding='latin-1')\n", - "\n", - "# We can rename the columns.\n", - "choc_data.columns = ['maker', 'specific_origin', 'reference_number', 'review_date', 'cocoa_percent', 'maker_location', 'rating', 'bean_type', 'broad_origin']\n", - "\n", - "# choc_data.dtypes\n", - "\n", - "# Replace empty/null values with \"Blend\"\n", - "choc_data['bean_type'] = choc_data['bean_type'].fillna('Blend')\n", - "\n", - "#@title Cast bean_type to string to remove leading 'u'\n", - "choc_data['bean_type'] = choc_data['bean_type'].astype(str)\n", - "choc_data['cocoa_percent'] = choc_data['cocoa_percent'].str.strip('%')\n", - "choc_data['cocoa_percent'] = pd.to_numeric(choc_data['cocoa_percent'])\n", - "\n", - "#@title Correct spelling mistakes, and replace city with country name\n", - "choc_data['maker_location'] = choc_data['maker_location']\\\n", - ".str.replace('Amsterdam', 'Holland')\\\n", - ".str.replace('U.K.', 'England')\\\n", - ".str.replace('Niacragua', 'Nicaragua')\\\n", - ".str.replace('Domincan Republic', 'Dominican Republic')\n", - "\n", - "# Adding this so that Holland and Netherlands map to the same country.\n", - "choc_data['maker_location'] = choc_data['maker_location']\\\n", - ".str.replace('Holland', 'Netherlands')\n", - "\n", - "def cleanup_spelling_abbrev(text):\n", - " replacements = [\n", - " ['-', ', '], ['/ ', ', '], ['/', ', '], ['\\(', ', '], [' and', ', '], [' &', ', '], ['\\)', ''],\n", - " ['Dom Rep|DR|Domin Rep|Dominican Rep,|Domincan Republic', 'Dominican Republic'],\n", - " ['Mad,|Mad$', 'Madagascar, '],\n", - " ['PNG', 'Papua New Guinea, '],\n", - " ['Guat,|Guat$', 'Guatemala, '],\n", - " ['Ven,|Ven$|Venez,|Venez$', 'Venezuela, '],\n", - " ['Ecu,|Ecu$|Ecuad,|Ecuad$', 'Ecuador, '],\n", - " ['Nic,|Nic$', 'Nicaragua, '],\n", - " ['Cost Rica', 'Costa Rica'],\n", - " ['Mex,|Mex$', 'Mexico, '],\n", - " ['Jam,|Jam$', 'Jamaica, '],\n", - " ['Haw,|Haw$', 'Hawaii, '],\n", - " ['Gre,|Gre$', 'Grenada, '],\n", - " ['Tri,|Tri$', 'Trinidad, '],\n", - " ['C Am', 'Central America'],\n", - " ['S America', 'South America'],\n", - " [', $', ''], [', ', ', '], [', ,', ', '], ['\\xa0', ' '],[',\\s+', ','],\n", - " [' Bali', ',Bali']\n", - " ]\n", - " for i, j in replacements:\n", - " text = re.sub(i, j, text)\n", - " return text\n", - "\n", - "choc_data['specific_origin'] = choc_data['specific_origin'].str.replace('.', '').apply(cleanup_spelling_abbrev)\n", - "\n", - "#@title Cast specific_origin to string\n", - "choc_data['specific_origin'] = choc_data['specific_origin'].astype(str)\n", - "\n", - "#@title Replace null-valued fields with the same value as for specific_origin\n", - 
"choc_data['broad_origin'] = choc_data['broad_origin'].fillna(choc_data['specific_origin'])\n", - "\n", - "#@title Clean up spelling mistakes and deal with abbreviations\n", - "choc_data['broad_origin'] = choc_data['broad_origin'].str.replace('.', '').apply(cleanup_spelling_abbrev)\n", - "\n", - "# Change 'Trinitario, Criollo' to \"Criollo, Trinitario\"\n", - "# Check with choc_data['bean_type'].unique()\n", - "choc_data.loc[choc_data['bean_type'].isin(['Trinitario, Criollo']),'bean_type'] = \"Criollo, Trinitario\"\n", - "# Confirm with choc_data[choc_data['bean_type'].isin(['Trinitario, Criollo'])]\n", - "\n", - "# Fix chocolate maker names\n", - "choc_data.loc[choc_data['maker']=='Shattel','maker'] = 'Shattell'\n", - "choc_data['maker'] = choc_data['maker'].str.replace(u'Na\\xef\\xbf\\xbdve','Naive')\n", - "\n", - "# Save the original column names\n", - "original_cols = choc_data.columns.values\n", - "\n", - "choc_data.head()" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "aIOACI88KI1k" - }, - "source": [ - "# 2. Preprocess Data" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "V9aJj0d2LdDG" - }, - "source": [ - "You will preprocess your data using the techniques described in\n", - "[Prepare Data](https://developers.google.com/machine-learning/clustering/prepare-data).\n", - "\n", - "Let's start with the feature `review_date`. If you assume that chocolate making\n", - "did not change over the 10 years of data, then `review_date` has no correlation\n", - "with the chocolate itself. You can safely ignore the feature. However, as a good data scientist, you should be curious about your data. Let's\n", - "plot the distribution for `review date` using a function from the Seaborn data visualization library. It looks like no one ate chocolate in 2009 and 2013. However, the\n", - "overall chocolate eating trend is positive and very encouraging. This is a good\n", - "time to eat some chocolate yourself!" - ] - }, - { - "cell_type": "code", - "metadata": { - "cellView": "both", - "colab_type": "code", - "id": "bJ-3B0OaKRyT", - "colab": {} - }, - "source": [ - "sns.distplot(choc_data['review_date'])" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "AYk5VR93feJm" - }, - "source": [ - "Plot the distribution for `rating`. Consider how you'd process this distribution. Then move ahead for the answer." - ] - }, - { - "cell_type": "code", - "metadata": { - "cellView": "both", - "colab_type": "code", - "id": "8XQ5CZ0uFIlh", - "colab": {} - }, - "source": [ - "# check the distribution\n", - "sns.distplot(choc_data['rating'])" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "01RF1MRDAPjC" - }, - "source": [ - "The distribution for `rating` is roughly a Gaussian distribution. How are Gaussian distributions processed? You know it. Normalize the data." - ] - }, - { - "cell_type": "code", - "metadata": { - "cellView": "both", - "colab_type": "code", - "id": "fl9HsbYIoJkl", - "colab": {} - }, - "source": [ - "# its a Gaussian! 
So, use z-score to normalize the data\n", - "choc_data['rating_norm'] = (choc_data['rating'] - choc_data['rating'].mean()\n", - " ) / choc_data['rating'].std()" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "3fDeI_0cftzz" - }, - "source": [ - "Examine the distribution for `cocoa_percent` and consider how to process it. Then check below for the answer." - ] - }, - { - "cell_type": "code", - "metadata": { - "cellView": "both", - "colab_type": "code", - "id": "nIQSWYYLGQeK", - "colab": {} - }, - "source": [ - "sns.distplot(choc_data['cocoa_percent'])" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "zpJgcmP9Aedg" - }, - "source": [ - "The distribution for `cocoa_percent` is close enough to a Gaussian distribution. Normalize the data." - ] - }, - { - "cell_type": "code", - "metadata": { - "cellView": "both", - "colab_type": "code", - "id": "1YSde7nloLt5", - "colab": {} - }, - "source": [ - "choc_data['cocoa_percent_norm'] = (\n", - " choc_data['cocoa_percent'] -\n", - " choc_data['cocoa_percent'].mean()) / choc_data['cocoa_percent'].std()" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "qcGL7gHygH06" - }, - "source": [ - "Display the first few rows to check the normalization for `rating` and `cocoa_percent`." - ] - }, - { - "cell_type": "code", - "metadata": { - "colab_type": "code", - "id": "_-TKLnZQgIhR", - "colab": {} - }, - "source": [ - "choc_data.head()" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "zNn0n3Kqf5Ta" - }, - "source": [ - "You have the cocoa beans' country of origin in `broad_origin` and the chocolates' country of manufacture in `maker_location`. However, to calculate similarity, you need the longitude and latitude\n", - "of the countries. Luckily, this geographic information is available in another table on\n", - "developers.google.com! The following code downloads the Dataset Publishing Language (DSPL)\n", - "Countries table and joins it with our chocolate reviews table, using the country\n", - "name as the key. Note that you are approximating countries by the latitude and longitude of their centers.\n", - "\n", - "Display the first few rows to spot\n", - "check the processed data. Notice the newly created `maker_lat`, `maker_long`, `origin_lat`, and `origin_long` fields. Do the values in fields match your expectations? 
" - ] - }, - { - "cell_type": "code", - "metadata": { - "cellView": "form", - "colab_type": "code", - "id": "AdJyelmJJ-9h", - "colab": {} - }, - "source": [ - "#@title Run code to add latitude and longitude data\n", - "# Load lat long data\n", - "\n", - "countries_info = pd.read_csv(\"https://download.mlcc.google.com/mledu-datasets/countries_lat_long.csv\", sep=\",\", encoding='latin-1')\n", - "\n", - "#Join the chocolate review and geographic information tables on maker country name\n", - "choc_data = pd.merge(\n", - " choc_data, countries_info, left_on=\"maker_location\", right_on=\"name\")\n", - "choc_data.rename(\n", - " columns={\n", - " \"longitude\": \"maker_long\",\n", - " \"latitude\": \"maker_lat\"\n", - " }, inplace=True)\n", - "choc_data.drop(\n", - " columns=[\"name\", \"country\"], inplace=True) # don't need this data\n", - "\n", - "#Join the chocolate review and geographic information tables on origin country name\n", - "choc_data = pd.merge(\n", - " choc_data, countries_info, left_on=\"broad_origin\", right_on=\"name\")\n", - "choc_data.rename(\n", - " columns={\n", - " \"longitude\": \"origin_long\",\n", - " \"latitude\": \"origin_lat\"\n", - " },\n", - " inplace=True)\n", - "choc_data.drop(\n", - " columns=[\"name\", \"country\"], inplace=True) # don't need this data\n", - "\n", - "choc_data.head()" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "NBoiApVwgRb5" - }, - "source": [ - "Check the distribution for the latitudes and longitudes and consider how to process the distributions. Then check below for the answer." - ] - }, - { - "cell_type": "code", - "metadata": { - "cellView": "both", - "colab_type": "code", - "id": "6yImioS8Lqwd", - "colab": {} - }, - "source": [ - "sns.distplot(choc_data['maker_lat'])" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "Cg1t5qmngcCZ" - }, - "source": [ - "Since latitude and longitude don't follow a specific distribution, convert the\n", - "latitude and longitude information into quantiles. Display the last few rows to verify the quantile values." - ] - }, - { - "cell_type": "code", - "metadata": { - "cellView": "both", - "colab_type": "code", - "id": "2RsiAsB3HRqu", - "colab": {} - }, - "source": [ - "numQuantiles = 20\n", - "colsQuantiles = ['maker_lat', 'maker_long', 'origin_lat', 'origin_long']\n", - "\n", - "def createQuantiles(dfColumn, numQuantiles):\n", - " return pd.qcut(dfColumn, numQuantiles, labels=False, duplicates='drop')\n", - "\n", - "\n", - "for string in colsQuantiles:\n", - " choc_data[string] = createQuantiles(choc_data[string], numQuantiles)\n", - " \n", - "choc_data.tail()" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "fpT98u1JWfoE" - }, - "source": [ - "Quantile values range up to 20. Bring quantile values to the same scale as other feature data by scaling them to [0,1]." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "colab_type": "code", - "id": "ypMg6cVxW0Uq", - "colab": {} - }, - "source": [ - "def minMaxScaler(numArr):\n", - " minx = np.min(numArr)\n", - " maxx = np.max(numArr)\n", - " numArr = (numArr - minx) / (maxx - minx)\n", - " return numArr\n", - "\n", - "\n", - "for string in colsQuantiles:\n", - " choc_data[string] = minMaxScaler(choc_data[string])" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "m23IAig1WonY" - }, - "source": [ - "The features `maker` and `bean_type` are categorical features. Convert\n", - "categorical features into one-hot encoding." - ] - }, - { - "cell_type": "code", - "metadata": { - "cellView": "both", - "colab_type": "code", - "id": "pZ29GSFWSTJA", - "colab": {} - }, - "source": [ - "# duplicate the \"maker\" feature since it's removed by one-hot encoding function\n", - "choc_data['maker2'] = choc_data['maker']\n", - "choc_data = pd.get_dummies(choc_data, columns=['maker2'], prefix=['maker'])\n", - "# similarly, duplicate the \"bean_type\" feature\n", - "choc_data['bean_type2'] = choc_data['bean_type']\n", - "choc_data = pd.get_dummies(choc_data, columns=['bean_type2'], prefix=['bean'])" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "k3pEOmohW9q5" - }, - "source": [ - "After clustering, when you interpret the results, the processed feature data is\n", - "hard to read. Save the original feature data in a new dataframe so you can\n", - "reference it later. Keep only the processed data in `choc_data`." - ] - }, - { - "cell_type": "code", - "metadata": { - "cellView": "both", - "colab_type": "code", - "id": "3pJutwixXpfy", - "colab": {} - }, - "source": [ - "# Split dataframe into two frames: Original data and data for clustering\n", - "choc_data_backup = choc_data.loc[:, original_cols].copy(deep=True)\n", - "choc_data.drop(columns=original_cols, inplace=True)\n", - "\n", - "# get_dummies returned ints for one-hot encoding but we want floats so divide by\n", - "# 1.0\n", - "# Note: In the latest version of \"get_dummies\", you can set \"dtype\" to float\n", - "choc_data = choc_data / 1.0" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "TU1UQNP2pIcT" - }, - "source": [ - "Inspect the last few records to ensure your precious chocolate data is looking\n", - "good! Remember that `choc_data` only shows columns with processed data because the columns holding the original data were moved to `choc_data_backup`." - ] - }, - { - "cell_type": "code", - "metadata": { - "cellView": "both", - "colab_type": "code", - "id": "nwWcnf4IpF7V", - "colab": {} - }, - "source": [ - "choc_data.tail()" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "E2HuxR-UcDOw" - }, - "source": [ - "# 3. Calculate Manual Similarity" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "G7Fi6dMBTL1g" - }, - "source": [ - "You've worked hard to process the data! Now calculating similarity between a\n", - "pair of chocolates is simple because all the features are numeric and in the\n", - "same range. For any two chocolates, simply find the root mean square error\n", - "(RMSE) of all features.\n", - "\n", - "First run this code to define the similarity function." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "cellView": "both", - "colab_type": "code", - "id": "zBGUouTEcAz3", - "colab": {} - }, - "source": [ - "def getSimilarity(obj1, obj2):\n", - " len1 = len(obj1.index)\n", - " len2 = len(obj2.index)\n", - " if not (len1 == len2):\n", - " print \"Error: Compared objects must have same number of features.\"\n", - " sys.exit()\n", - " return 0\n", - " else:\n", - " similarity = obj1 - obj2\n", - " similarity = np.sum((similarity**2.0) / 10.0)\n", - " similarity = 1 - math.sqrt(similarity)\n", - " return similarity" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "Ybig-jrATerQ" - }, - "source": [ - "Now calculate the similarity between the first chocolate and the next 4\n", - "chocolates. Verify the calculated similarity against your intuitive expectations\n", - "by comparing the calculated similarity to the actual feature data shown in the\n", - "next cell.\n", - "\n", - "If you're curious about similarities between other chocolates, do modify the\n", - "code below and take a look!" - ] - }, - { - "cell_type": "code", - "metadata": { - "cellView": "form", - "colab_type": "code", - "id": "Tylgxe-FM6NP", - "colab": {} - }, - "source": [ - "choc1 = 0 #@param\n", - "chocsToCompare = [1, 4] #@param\n", - "\n", - "print \"Similarity between chocolates \" + str(choc1) + \" and ...\"\n", - "\n", - "for ii in range(chocsToCompare[0], chocsToCompare[1] + 1):\n", - " print str(ii) + \": \" + str(\n", - " getSimilarity(choc_data.loc[choc1], choc_data.loc[ii]))\n", - "\n", - "print \"\\n\\nFeature data for chocolate \" + str(choc1)\n", - "print choc_data_backup.loc[choc1:choc1, :]\n", - "print \"\\n\\nFeature data for compared chocolates \" + str(chocsToCompare)\n", - "print choc_data_backup.loc[chocsToCompare[0]:chocsToCompare[1], :]" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "ImOGD5GJ8Ia7" - }, - "source": [ - "# 4. Cluster Chocolate Dataset\n", - "\n", - "We're ready to cluster the chocolates! Run the code to setup the k-means\n", - "clustering functions. You do not need to understand the code.\n", - "\n", - "**Note**: If you're following the self study, then before running the rest of\n", - "this Colab, read the sections on\n", - "[k-means](https://developers.google.com/machine-learning/clustering/algorithm/run-algorithm)\n", - "and\n", - "[quality metrics](https://developers.google.com/machine-learning/clustering/interpret)." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "cellView": "form", - "colab_type": "code", - "id": "eExms-TP8Hn6", - "colab": {} - }, - "source": [ - "#@title Run cell to setup functions\n", - "def dfSimilarity(df, centroids):\n", - " ### dfSimilarity = Calculate similarities for dataframe input\n", - " ### We need to calculate ||a-b||^2 = |a|^2 + |b|^2 - 2*|a|*|b|\n", - " ### Implement this with matrix operations\n", - " ### See the Appendix for further explanation\n", - " numPoints = len(df.index)\n", - " numCentroids = len(centroids.index)\n", - " ## Strictly speaking, we don't need to calculate the norm of points\n", - " # because it adds a constant bias to distances\n", - " # But calculating it so that the similarity doesn't go negative\n", - " # And that we expect similarities in [0,1] which aids debugging\n", - " pointNorms = np.square(nla.norm(df, axis=1))\n", - " pointNorms = np.reshape(pointNorms, [numPoints, 1])\n", - " ## Calculate the norm of centroids\n", - " centroidNorms = np.square(nla.norm(centroids, axis=1))\n", - " centroidNorms = np.reshape(centroidNorms, (1, numCentroids))\n", - " ## Calculate |a|^2 + |b|^2 - 2*|a|*|b|\n", - " similarities = pointNorms + centroidNorms - 2.0 * np.dot(\n", - " df, np.transpose(centroids))\n", - " # Divide by the number of features\n", - " # Which is 10 because the one-hot encoding means the \"Maker\" and \"Bean\" are\n", - " # weighted twice\n", - " similarities = similarities / 10.0\n", - " # numerical artifacts lead to negligible but negative values that go to NaN on the root\n", - " similarities = similarities.clip(min=0.0)\n", - " # Square root since it's ||a-b||^2\n", - " similarities = np.sqrt(similarities)\n", - " return similarities\n", - "\n", - "\n", - "def initCentroids(df, k, feature_cols):\n", - " # Pick 'k' examples are random to serve as initial centroids\n", - " limit = len(df.index)\n", - " centroids_key = np.random.randint(0, limit - 1, k)\n", - " centroids = df.loc[centroids_key, feature_cols].copy(deep=True)\n", - " # the indexes get copied over so reset them\n", - " centroids.reset_index(drop=True, inplace=True)\n", - " return centroids\n", - "\n", - "\n", - "def pt2centroid(df, centroids, feature_cols):\n", - " ### Calculate similarities between all points and centroids\n", - " ### And assign points to the closest centroid + save that distance\n", - " numCentroids = len(centroids.index)\n", - " numExamples = len(df.index)\n", - " # dfSimilarity = Calculate similarities for dataframe input\n", - " dist = dfSimilarity(df.loc[:, feature_cols], centroids.loc[:, feature_cols])\n", - " df.loc[:, 'centroid'] = np.argmin(dist, axis=1) # closest centroid\n", - " df.loc[:, 'pt2centroid'] = np.min(dist, axis=1) # minimum distance\n", - " return df\n", - "\n", - "\n", - "def recomputeCentroids(df, centroids, feature_cols):\n", - " ### For every centroid, recompute it as an average of the points\n", - " ### assigned to it\n", - " numCentroids = len(centroids.index)\n", - " for cen in range(numCentroids):\n", - " dfSubset = df.loc[df['centroid'] == cen,\n", - " feature_cols] # all points for centroid\n", - " if not (dfSubset.empty): # if there are points assigned to the centroid\n", - " clusterAvg = np.sum(dfSubset) / len(dfSubset.index)\n", - " centroids.loc[cen] = clusterAvg\n", - " return centroids\n", - "\n", - "\n", - "def kmeans(df, k, feature_cols, verbose):\n", - " flagConvergence = False\n", - " maxIter = 100\n", - " iter = 0 # ensure kmeans doesn't run for ever\n", - " centroids = initCentroids(df, k, 
feature_cols)\n", - " while not (flagConvergence):\n", - " iter += 1\n", - " #Save old mapping of points to centroids\n", - " oldMapping = df['centroid'].copy(deep=True)\n", - " # Perform k-means\n", - " df = pt2centroid(df, centroids, feature_cols)\n", - " centroids = recomputeCentroids(df, centroids, feature_cols)\n", - " # Check convergence by comparing [oldMapping, newMapping]\n", - " newMapping = df['centroid']\n", - " flagConvergence = all(oldMapping == newMapping)\n", - " if verbose == 1:\n", - " print 'Total distance:' + str(np.sum(df['pt2centroid']))\n", - " if (iter > maxIter):\n", - " print 'k-means did not converge! Reached maximum iteration limit of ' \\\n", - " + str(maxIter) + '.'\n", - " sys.exit()\n", - " return\n", - " print 'k-means converged for ' + str(k) + ' clusters' + \\\n", - " ' after ' + str(iter) + ' iterations!'\n", - " return [df, centroids]" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "-KnRLWvw1rJ9" - }, - "source": [ - "Run the cell to cluster the chocolate dataset, where `k` is the number of\n", - "clusters.\n", - "\n", - "On every iteration of k-means, the output shows how the sum of distances from all examples to their centroids reduces, such that k-means always converges. The following table shows the data for the first few chocolates. On the extreme right of the table, check the assigned centroid for each example in the `centroid` column and the distance from the example to its centroid in the `pt2centroid` column." - ] - }, - { - "cell_type": "code", - "metadata": { - "cellView": "form", - "colab_type": "code", - "id": "AKDwhN9J1PhU", - "colab": {} - }, - "source": [ - "k = 30 #@param\n", - "\n", - "feature_cols = choc_data.columns.values # save original columns\n", - "# initialize every point to an impossible value, the k+1 cluster\n", - "choc_data['centroid'] = k\n", - "# init the point to centroid distance to an impossible value \"2\" (>1)\n", - "choc_data['pt2centroid'] = 2\n", - "[choc_data, centroids] = kmeans(choc_data, k, feature_cols, 1)\n", - "print(\"Data for the first few chocolates, with 'centroid' and 'pt2centroid' on\"\n", - " ' the extreme right:')\n", - "choc_data.head()" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "bfFShL6wqa-9" - }, - "source": [ - "## Inspect Clustering Result" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "13TnsPz23xOU" - }, - "source": [ - "Inspect the chocolates in different clusters by changing the parameter `clusterNumber`\n", - "in the next cell and running the cell. Consider these questions as you inspect the clusters:\n", - "\n", - "* Are the clusters meaningful?\n", - "* Do the clusters weight certain features more than others? Why?\n", - "* Does changing the number of clusters make the clusters more or less\n", - " meaningful?\n", - "\n", - "After considering these questions, expand the next section for a discussion of clustering results." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "cellView": "form", - "colab_type": "code", - "id": "NHWgGmpyux39", - "colab": {} - }, - "source": [ - "clusterNumber = 7 #@param\n", - "choc_data_backup.loc[choc_data['centroid'] == clusterNumber, :]" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "MJtuP9w5jJHq" - }, - "source": [ - "### Solution: Discussion of Clustering Results\n", - "\n", - "Click below for the answer." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "gxiPD8g_jShi" - }, - "source": [ - "**Discussion**: The clustering result does unintentionally weight certain\n", - "features more than others.\n", - "\n", - "That's because a given chocolate maker will have the same country of\n", - "manufacture, which leads to mutual information between the features `maker`,\n", - "`maker_lat`, and `maker_long`. Similarly, suppose each country tends to grow a\n", - "particular type of bean, then there is mutual information between `origin_lat`,\n", - "`origin_long`, and `bean_type`.\n", - "\n", - "As a result, features that share mutual information are effectively weighted\n", - "more strongly than uncorrelated features. The solution is to use a supervised\n", - "similarity measure because the DNN eliminates correlated information. See\n", - "[k-means advantages and disadvantages](https://developers.google.com/machine-learning/clustering/algorithm/advantages-disadvantages).\n", - "\n", - "Now consider the one-hot encoding. Chocolates that have different makers will\n", - "differ by 1 in two columns. Similarly, chocolates that are made of different\n", - "bean types will differ by 1 in two features. Therefore, differences in makers\n", - "and bean types will be weighted twice as much as other features. This uneven\n", - "weighting skews the clustering result." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "Z1eW0PlG57Zs" - }, + "metadata": {}, "source": [ - "# 5. Quality Metrics for Clusters\n", - "\n", - "For the clusters, let's calculate the metrics discussed in\n", - "[Interpret Results](https://developers.google.com/machine-learning/clustering/interpret).\n", - "Read that course content before starting this code section.\n", - "\n", - "Run the next cell to set up functions." + "### This Colab was deprecated July 2024." 
] - }, - { - "cell_type": "code", - "metadata": { - "colab_type": "code", - "id": "i9Y2H-nR56C3", - "colab": {} - }, - "source": [ - "#@title Run cell to set up functions { display-mode: \"form\" }\n", - "def clusterCardinality(df):\n", - " k = np.max(df['centroid']) + 1\n", - " k = k.astype(int)\n", - " print 'Number of clusters:' + str(k)\n", - " clCard = np.zeros(k)\n", - " for kk in range(k):\n", - " clCard[kk] = np.sum(df['centroid'] == kk)\n", - " clCard = clCard.astype(int)\n", - " # print \"Cluster Cardinality:\"+str(clCard)\n", - " plt.figure()\n", - " plt.bar(range(k), clCard)\n", - " plt.title('Cluster Cardinality')\n", - " plt.xlabel('Cluster Number: ' + str(0) + ' to ' + str(k - 1))\n", - " plt.ylabel('Points in Cluster')\n", - " return clCard\n", - "\n", - "\n", - "def clusterMagnitude(df):\n", - " k = np.max(df['centroid']) + 1\n", - " k = k.astype(int)\n", - " cl = np.zeros(k)\n", - " clMag = np.zeros(k)\n", - " for kk in range(k):\n", - " idx = np.where(df['centroid'] == kk)\n", - " idx = idx[0]\n", - " clMag[kk] = np.sum(df.loc[idx, 'pt2centroid'])\n", - " # print \"Cluster Magnitude:\",clMag #precision set using np pref\n", - " plt.figure()\n", - " plt.bar(range(k), clMag)\n", - " plt.title('Cluster Magnitude')\n", - " plt.xlabel('Cluster Number: ' + str(0) + ' to ' + str(k - 1))\n", - " plt.ylabel('Total Point-to-Centroid Distance')\n", - " return clMag\n", - "\n", - "\n", - "def plotCardVsMag(clCard, clMag):\n", - " plt.figure()\n", - " plt.scatter(clCard, clMag)\n", - " plt.xlim(xmin=0)\n", - " plt.ylim(ymin=0)\n", - " plt.title('Magnitude vs Cardinality')\n", - " plt.ylabel('Magnitude')\n", - " plt.xlabel('Cardinality')\n", - "\n", - "\n", - "def clusterQualityMetrics(df):\n", - " clCard = clusterCardinality(df)\n", - " clMag = clusterMagnitude(df)\n", - " plotCardVsMag(clCard, clMag)" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "1nLYPlv4ejwD" - }, - "source": [ - "Calculate the following metrics by running the next cell:\n", - "\n", - "* cardinality of your clusters\n", - "* magnitude of your clusters\n", - "* cardinality vs Magnitude\n", - "\n", - "From the plots, find clusters that are outliers and clusters that are average.\n", - "Compare the examples in outlier clusters versus those in average clusters by\n", - "changing `clusterNumber` in the previous section." - ] - }, - { - "cell_type": "code", - "metadata": { - "cellView": "both", - "colab_type": "code", - "id": "3llKFtEpeiZ_", - "colab": {} - }, - "source": [ - "clusterQualityMetrics(choc_data)" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "SBa0k9KK2PAt" - }, - "source": [ - "## Find Optimum Number of Clusters\n", - "\n", - "You want to find the right number of clusters as you did in the previous\n", - "programming exercise. For details, read \"*Step Three: Optimum Number of\n", - "Clusters*\" on the page\n", - "[Interpret Results](https://developers.google.com/machine-learning/clustering/interpret).\n", - "\n", - "Run the code below. Does the plot follow the form shown on \"*Interpret Results*\"? What's the\n", - "optimum number of clusters? Experiment with the parameters below if necessary. After considering the questions, expand the next section for a discussion." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "LD9RQUIWjfwS" - }, - "source": [ - "### Solution: Discussion on optimum number of clusters\n", - "\n", - "Click below for the solution." - ] - }, - { - "cell_type": "code", - "metadata": { - "cellView": "form", - "colab_type": "code", - "id": "-df7QnPlhuIN", - "colab": {} - }, - "source": [ - "# Plot loss vs number of clusters\n", - "def lossVsClusters(kmin, kmax, kstep, choc_data):\n", - " kmax += 1 # include kmax-th cluster in range\n", - " kRange = range(kmin, kmax, kstep)\n", - " loss = np.zeros(len(kRange))\n", - " lossCtr = 0\n", - " for kk in kRange:\n", - " [choc_data, centroids] = kmeans(choc_data, kk, feature_cols, 0)\n", - " loss[lossCtr] = np.sum(choc_data['pt2centroid'])\n", - " lossCtr += 1\n", - " plt.scatter(kRange, loss)\n", - " plt.title('Loss vs Clusters Used')\n", - " plt.xlabel('Number of clusters')\n", - " plt.ylabel('Total Point-to-Centroid Distance')\n", - "\n", - "\n", - "kmin = 5 # @param\n", - "kmax = 80 # @param\n", - "kstep = 2 # @param\n", - "lossVsClusters(kmin, kmax, kstep, choc_data)" + } + ], + "metadata": { + "colab": { + "collapsed_sections": [ + "9EjQt_o9Xf_L", + "tj67XUzmjuNK" ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "0wYfwKGcVT-R" - }, - "source": [ - "**Discussion**: The ideal plot of loss vs clusters has a clear inflection point beyond which the decrease in loss flattens out. Here, the plot lacks an obvious inflection point. However, the decrease in loss evens out twice, at approximately `k = 15`\n", - "and `k = 35`, suggesting that `k` has optimum values close to 15 and 35. Note that your plot can differ due to the inherent randomness in the k-means algorithm.\n", - "\n", - "You\n", - "typically see a plot with a clear inflection point plot when the data has naturally clumped\n", - "examples. When data doesn't have natural clumps, this plot only hints\n", - "as to the optimum value for `k`." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "tj67XUzmjuNK" - }, - "source": [ - "## Discussion\n", - "\n", - "On the page\n", - "[Supervised Similarity Measure](https://developers.google.com/machine-learning/clustering/similarity/supervised-similarity),\n", - "read the \"*Comparison of Manual and Supervised Measures*\". Try to connect the description of a manual similarity measure to what your learned from this codelab. Then click below to view the discussion. Lastly, **keep this Colab open** to compare the results with the next Colab that uses a supervised similarity measure." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "pVEAjXut0uw7" - }, - "source": [ - "The Colab demonstrates the following characteristics of a manual similarity metric:\n", - "\n", - "* **Does not eliminate redundant information in correlated features**. As\n", - " discussed in this [section](#scrollTo=MJtuP9w5jJHq), our manual similarity\n", - " measure did not eliminate redundant information between features.\n", - "* **Provides insight into calculated similarities**. Looking at the clustering\n", - " results, you could see how the maker location and bean origin had a greater\n", - " influence on the clustering result. You saw how the one-hot encoding\n", - " resulted in weighting maker and bean type twice as much as other features.\n", - "* **Suitable for small datasets with few features**. 
Yes, you could easily\n", - " construct a manual similarity measure for the chocolate dataset since it has\n", - " less than two thousand examples and only nine features.\n", - "* **Not suitable for large datasets with many features**. If the chocolate dataset\n", - " had dozens of features and many thousands of examples, it would be difficult\n", - " to construct a correct similarity measure and then verify the similarity\n", - " measure across the dataset." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "aZ12kbwD4Qtu" - }, - "source": [ - "# Appendix: Calculating Point to Centroid Distance for Large Datasets\n", - "\n", - "On every iteration of k-means, your code calculates the distance between every\n", - "point and every centroid. When you cluster large numbers of points using many centroids, you must implement\n", - "this operation efficiently. Let's see how.\n", - "\n", - "Assume you have a vector \"P\" for a point, and another vector \"C\" for a centroid.\n", - "You need to calculate $||P-C||^2$. Mathematically:\n", - "\n", - "$$||P-C||^2 = |P|^2 + |C|^2 - 2 \\cdot P \\cdot C$$\n", - "\n", - "The code below generalizes this operation to matrices that represent arbitrary\n", - "numbers of points and centroids. Using the code, you can calculate the\n", - "point-centroid distances for all combinations of your points and centroids." - ] + "name": "Colab - Manual Similarity with Chocolates", + "provenance": [], + "version": "0.3.2" }, - { - "cell_type": "code", - "metadata": { - "cellView": "both", - "colab_type": "code", - "id": "T6hHN2bCKi7k", - "colab": {} - }, - "source": [ - "#@title\n", - "# Calculate distances between \"A\" point and \"B\" centroids to return\n", - "# axb array where a_i,b_j distance is at (i,j) position\n", - "A = np.array([[1, 2, 3],\\\n", - " [3, 1, 2],\\\n", - " [0, 0, 0]])\n", - "A = A / np.max(A)\n", - "B = np.array([[4, 5, 6],\\\n", - " [6, 6, 6]])\n", - "B = B / np.max(B)\n", - "numPoints = A.shape[0]\n", - "numCentroids = B.shape[0]\n", - "pointNorms = np.reshape(nla.norm(A, axis=1)**2.0, [numPoints, 1])\n", - "centroidNorms = np.reshape(nla.norm(B, axis=1)**2.0, (1, numCentroids))\n", - "print \"\"\"Distance matrix of size 'p' by 'c' where Distance between \n", - "point 'p' and centroid 'c' is at (p,c).\"\"\"\n", - "print pointNorms + centroidNorms - 2.0 * np.dot(A, np.transpose(B))" - ], - "execution_count": 0, - "outputs": [] + "language_info": { + "name": "python" } - ] -} \ No newline at end of file + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/ml/clustering/clustering-supervised-similarity.ipynb b/ml/clustering/clustering-supervised-similarity.ipynb index e3b3a0a..703d2d2 100644 --- a/ml/clustering/clustering-supervised-similarity.ipynb +++ b/ml/clustering/clustering-supervised-similarity.ipynb @@ -1,1117 +1,27 @@ { - "nbformat": 4, - "nbformat_minor": 0, + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### This Colab was deprecated July 2024." 
+ ] + } + ], "metadata": { "colab": { - "name": "Clustering - Supervised Similarity with Chocolates", - "provenance": [], "collapsed_sections": [ "9EjQt_o9Xf_L", "MJtuP9w5jJHq" - ] + ], + "name": "Clustering - Supervised Similarity with Chocolates", + "provenance": [] }, "kernelspec": { - "name": "python3", - "display_name": "Python 3" + "display_name": "Python 3", + "name": "python3" } }, - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "9EjQt_o9Xf_L" - }, - "source": [ - "#### Copyright 2018 Google LLC." - ] - }, - { - "cell_type": "code", - "metadata": { - "cellView": "both", - "colab_type": "code", - "id": "oXzTW-CnXf_Q", - "colab": {} - }, - "source": [ - "#@title\n", - "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "# you may not use this file except in compliance with the License.\n", - "# You may obtain a copy of the License at\n", - "#\n", - "# https://www.apache.org/licenses/LICENSE-2.0\n", - "#\n", - "# Unless required by applicable law or agreed to in writing, software\n", - "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", - "# See the License for the specific language governing permissions and\n", - "# limitations under the License." - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "9NkysjxvKAli" - }, - "source": [ - "# Supervised Similarity Measure\n", - "We'll cluster chocolates in the [Chocolate Bar Ratings](https://www.kaggle.com/rtatman/chocolate-bar-ratings) dataset using k-means with a supervised similarity measure. The dataset has ratings\n", - "of chocolate bars along with their cocoa percentage, bean type, bean origin,\n", - "maker name, and maker country. You will:\n", - "\n", - "* Load and clean the data.\n", - "* Process the data.\n", - "* Generate embeddings by training a DNN.\n", - "* Cluster the chocolates using k-means.\n", - "* Check the clustering result using quality metrics.\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "2X92CHu-KDOi" - }, - "source": [ - "# 1. Load and clean data\n", - "Run the section below to load and clean the dataset. You do not need to understand the code. The code displays data for the first few chocolates." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "cellView": "form", - "colab_type": "code", - "id": "Sq-yxIzRO4R2", - "colab": {} - }, - "source": [ - "#@title Run to load and clean the dataset\n", - "%reset -f\n", - "from __future__ import print_function\n", - "\n", - "import math\n", - "import numpy as np\n", - "import numpy.linalg as nla\n", - "import pandas as pd\n", - "import re\n", - "import six\n", - "from os.path import join\n", - "from matplotlib import pyplot as plt\n", - "\n", - "import tensorflow.compat.v1 as tf\n", - "tf.disable_v2_behavior()\n", - "\n", - "\n", - "# Set the output display to have one digit for decimal places and limit it to\n", - "# printing 15 rows.\n", - "pd.options.display.float_format = '{:.2f}'.format\n", - "pd.options.display.max_rows = 15\n", - "\n", - "choc_data = pd.read_csv(\"https://download.mlcc.google.com/mledu-datasets/flavors_of_cacao.csv\", sep=\",\", encoding='latin-1')\n", - "\n", - "# We can rename the columns.\n", - "choc_data.columns = ['maker', 'specific_origin', 'reference_number', 'review_date', 'cocoa_percent', 'maker_location', 'rating', 'bean_type', 'broad_origin']\n", - "\n", - "# choc_data.dtypes\n", - "\n", - "# Replace empty/null values with \"Blend\"\n", - "choc_data['bean_type'] = choc_data['bean_type'].fillna('Blend')\n", - "\n", - "#@title Cast bean_type to string to remove leading 'u'\n", - "choc_data['bean_type'] = choc_data['bean_type'].astype(str)\n", - "choc_data['cocoa_percent'] = choc_data['cocoa_percent'].str.strip('%')\n", - "choc_data['cocoa_percent'] = pd.to_numeric(choc_data['cocoa_percent'])\n", - "\n", - "#@title Correct spelling mistakes, and replace city with country name\n", - "choc_data['maker_location'] = choc_data['maker_location']\\\n", - ".str.replace('Amsterdam', 'Holland')\\\n", - ".str.replace('U.K.', 'England')\\\n", - ".str.replace('Niacragua', 'Nicaragua')\\\n", - ".str.replace('Domincan Republic', 'Dominican Republic')\n", - "\n", - "# Adding this so that Holland and Netherlands map to the same country.\n", - "choc_data['maker_location'] = choc_data['maker_location']\\\n", - ".str.replace('Holland', 'Netherlands')\n", - "\n", - "def cleanup_spelling_abbrev(text):\n", - " replacements = [\n", - " ['-', ', '], ['/ ', ', '], ['/', ', '], ['\\(', ', '], [' and', ', '], [' &', ', '], ['\\)', ''],\n", - " ['Dom Rep|DR|Domin Rep|Dominican Rep,|Domincan Republic', 'Dominican Republic'],\n", - " ['Mad,|Mad$', 'Madagascar, '],\n", - " ['PNG', 'Papua New Guinea, '],\n", - " ['Guat,|Guat$', 'Guatemala, '],\n", - " ['Ven,|Ven$|Venez,|Venez$', 'Venezuela, '],\n", - " ['Ecu,|Ecu$|Ecuad,|Ecuad$', 'Ecuador, '],\n", - " ['Nic,|Nic$', 'Nicaragua, '],\n", - " ['Cost Rica', 'Costa Rica'],\n", - " ['Mex,|Mex$', 'Mexico, '],\n", - " ['Jam,|Jam$', 'Jamaica, '],\n", - " ['Haw,|Haw$', 'Hawaii, '],\n", - " ['Gre,|Gre$', 'Grenada, '],\n", - " ['Tri,|Tri$', 'Trinidad, '],\n", - " ['C Am', 'Central America'],\n", - " ['S America', 'South America'],\n", - " [', $', ''], [', ', ', '], [', ,', ', '], ['\\xa0', ' '],[',\\s+', ','],\n", - " [' Bali', ',Bali']\n", - " ]\n", - " for i, j in replacements:\n", - " text = re.sub(i, j, text)\n", - " return text\n", - "\n", - "choc_data['specific_origin'] = choc_data['specific_origin'].str.replace('.', '').apply(cleanup_spelling_abbrev)\n", - "\n", - "#@title Cast specific_origin to string\n", - "choc_data['specific_origin'] = choc_data['specific_origin'].astype(str)\n", - "\n", - "#@title Replace null-valued fields with the same value as for specific_origin\n", - 
"choc_data['broad_origin'] = choc_data['broad_origin'].fillna(choc_data['specific_origin'])\n", - "\n", - "#@title Clean up spelling mistakes and deal with abbreviations\n", - "choc_data['broad_origin'] = choc_data['broad_origin'].str.replace('.', '').apply(cleanup_spelling_abbrev)\n", - "\n", - "# Change 'Trinitario, Criollo' to \"Criollo, Trinitario\"\n", - "# Check with choc_data['bean_type'].unique()\n", - "choc_data.loc[choc_data['bean_type'].isin(['Trinitario, Criollo']),'bean_type'] = \"Criollo, Trinitario\"\n", - "# Confirm with choc_data[choc_data['bean_type'].isin(['Trinitario, Criollo'])]\n", - "\n", - "# Fix chocolate maker names\n", - "choc_data.loc[choc_data['maker']=='Shattel','maker'] = 'Shattell'\n", - "choc_data['maker'] = choc_data['maker'].str.replace(u'Na\\xef\\xbf\\xbdve','Naive')\n", - "\n", - "choc_data.head()" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "Gtw73LZKeDux" - }, - "source": [ - "# 2. Process Data\n", - "Because you're using a DNN, you do not need to manually process the data. The DNN transforms the data for us. However, if possible, you should remove features that could distort the similarity calculation. Here, the features `review_date` and `reference_number` are not correlated with similarity. That is, chocolates that are reviewed closer together in time are not more or less similar than chocolates reviewed further apart. Remove these two features by running the following code." - ] - }, - { - "cell_type": "code", - "metadata": { - "colab_type": "code", - "id": "BQKj_NVSecDx", - "colab": {} - }, - "source": [ - "choc_data.drop(columns=['review_date','reference_number'],inplace=True)\n", - "choc_data.head()" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "UnH92sD8e1ri" - }, - "source": [ - "# 3. Generate Embeddings from DNN\n", - "\n", - "We're ready to generate embeddings by training the DNN on the feature data. This section draws on concepts discussed on the page [Supervised Similarity Measure](https://developers.google.com/machine-learning/clustering/similarity/supervised-similarity).\n", - "\n", - "Run the section below to set up functions to train the DNN that generates embeddings. You do not need to understand the code." - ] - }, - { - "cell_type": "code", - "metadata": { - "cellView": "form", - "colab_type": "code", - "id": "S1IyjxoUv57M", - "colab": {} - }, - "source": [ - "#@title Functions to Build and Train a Similarity DNN Model\n", - "\n", - "class SimilarityModel(object):\n", - " \"\"\"Class to build, train, and inspect a Similarity Model.\n", - "\n", - " This class builds a deep neural network that maps a dataset of entities\n", - " with heterogenous features to an embedding space.\n", - " Given a dataset as a pandas dataframe, determine the model by specifying\n", - " the set of features used as input and as labels to the DNN, and the\n", - " size of each hidden layer. The data is mapped to the embedding space\n", - " in the last hidden layer.\n", - " \n", - " To build an auto-encoder, make the set of output features identical to the set\n", - " of input features. Alternatively, build a predictor by using a single feature\n", - " as the label. 
When using a single feature as a label, ensure\n", - " this feature is removed from the input, or add at least\n", - " one hidden layer of a sufficiently low dimension such that the model cannot\n", - " trivially learn the label.\n", - " Caveat: The total loss being minimized is a simple sum of losses for each\n", - " output label (plus the regularization). If the output feature set combines\n", - " sparse and dense features, the total loss is a sum of cross-entropy soft-max\n", - " losses with root mean squared error losses, potentially in different scales,\n", - " which could emphasis some output labels more than others.\n", - " \"\"\"\n", - "\n", - " def __init__(self,\n", - " dataframe,\n", - " input_feature_names,\n", - " output_feature_names,\n", - " dense_feature_names,\n", - " sparse_input_feature_embedding_dims,\n", - " hidden_dims=[32],\n", - " l2_regularization=0.0,\n", - " use_bias=True,\n", - " batch_size=100,\n", - " inspect=False):\n", - " \"\"\"Build a similarity model.\n", - "\n", - " Args:\n", - " dataframe: the pandas dataframe used to train and validate the model.\n", - " input_feature_names: list of strings, names of input feature columns.\n", - " output_feature_names: list of strings, names of output feature columns.\n", - " dense_feature_names: list of strings, names of feature columns that are\n", - " treated as dense. All other feature columns are treated as sparse.\n", - " sparse_input_feature_embedding_dims: dictionary that maps feature names to\n", - " ints, expressing the embedding dimension of each input feature. Any\n", - " sparse feature in input_feature_names must be in this dictionary.\n", - " hidden_dims: list of ints, dimensions of each hidden layer. These hidden\n", - " layers are not counting the first layer which is a concatenation of the\n", - " input embeddings and the dense input features. Hence, this list can be\n", - " empty, in which case the outputs of the network are directly connected\n", - " to the input embeddings and/or dense inputs.\n", - " use_bias: bool, if true, add a bias term to each hidden layer.\n", - " batch_size: int, batch size.\n", - " inspect: bool, if true, add each tensor of the model to the list of\n", - " tensors that are inspected.\n", - " \"\"\"\n", - " used_feature_names = tuple(\n", - " set(input_feature_names).union(output_feature_names))\n", - " sparse_feature_names = tuple(\n", - " set(used_feature_names).difference(dense_feature_names))\n", - " # Dictionary mapping each sparse feature column to its vocabulary.\n", - " ### sparse_feature_vocabs = { 'maker': [u'A. Morin', u'AMMA', ...], ... 
}\n", - " sparse_feature_vocabs = {\n", - " sfn: sorted(list(set(choc_data[sfn].values)))\n", - " for sfn in sparse_feature_names\n", - " }\n", - "\n", - " # Sparse output features are mapped to ids via tf.feature_to_id, hence\n", - " # we need key-id pairs for these vocabularies.\n", - " sparse_output_feature_names = (\n", - " tuple(set(sparse_feature_names).intersection(output_feature_names)))\n", - " keys_and_values = {}\n", - " for fn in sparse_output_feature_names:\n", - " keys = tf.constant(\n", - " sparse_feature_vocabs[fn],\n", - " dtype=tf.string,\n", - " name='{}_vocab_keys'.format(fn))\n", - " values = tf.range(\n", - " len(sparse_feature_vocabs[fn]),\n", - " dtype=tf.int64,\n", - " name='{}_vocab_values'.format(fn))\n", - " keys_and_values[fn] = (keys, values)\n", - "\n", - " # Class instance data members.\n", - " self._session = None\n", - " self._loss = None\n", - " self._metrics = {}\n", - " self._embeddings = None\n", - " self._vars_to_inspect = {}\n", - "\n", - " def split_dataframe(df, holdout_fraction=0.1):\n", - " \"\"\"Splits a pandas dataframe into training and test sets.\n", - "\n", - " Args:\n", - " df: the source pandas dataframe.\n", - " holdout_fraction: fraction of dataframe rows to use in the test set.\n", - "\n", - " Returns:\n", - " A pair of non-overlapping pandas dataframe for training and holdout.\n", - " \"\"\"\n", - " test = df.sample(frac=holdout_fraction, replace=False)\n", - " train = df[~df.index.isin(test.index)]\n", - " return train, test\n", - "\n", - " train_dataframe, test_dataframe = split_dataframe(dataframe)\n", - "\n", - " def make_batch(dataframe, batch_size):\n", - " \"\"\"Creates a batch of examples.\n", - "\n", - " Args:\n", - " dataframe: a panda dataframe with rows being examples and with\n", - " columns being feature columns.\n", - " batch_size: the batch size.\n", - "\n", - " Returns:\n", - " A dictionary of tensors, keyed by their feature names.\n", - " Each tensor is of shape [batch_size]. 
Tensors for sparse features are of\n", - " strings, while tensors for dense features are of floats.\n", - " \"\"\"\n", - " used_features = {ufn: dataframe[ufn] for ufn in used_feature_names}\n", - " batch = (\n", - " tf.data.Dataset.from_tensor_slices(used_features).shuffle(1000)\n", - " .repeat().batch(batch_size).make_one_shot_iterator().get_next())\n", - " if inspect:\n", - " for k, v in six.iteritems(batch):\n", - " self._vars_to_inspect['input_%s' % k] = v\n", - " return batch\n", - "\n", - " def generate_feature_columns(feature_names):\n", - " \"\"\"Creates the list of used feature columns.\n", - "\n", - " Args:\n", - " feature_names: an iterable of strings with the names of the features for\n", - " which feature columns are generated.\n", - "\n", - " Returns:\n", - " A dictionary, keyed by feature names, of _DenseColumn and\n", - " _NumericColumn.\n", - " \"\"\"\n", - " used_sparse_feature_names = (\n", - " tuple(set(sparse_feature_names).intersection(feature_names)))\n", - " used_dense_feature_names = (\n", - " tuple(set(dense_feature_names).intersection(feature_names)))\n", - " f_columns = {}\n", - " for sfn in used_sparse_feature_names:\n", - " sf_column = tf.feature_column.categorical_column_with_vocabulary_list(\n", - " key=sfn,\n", - " vocabulary_list=sparse_feature_vocabs[sfn],\n", - " num_oov_buckets=0)\n", - " f_columns[sfn] = tf.feature_column.embedding_column(\n", - " categorical_column=sf_column,\n", - " dimension=sparse_input_feature_embedding_dims[sfn],\n", - " combiner='mean',\n", - " initializer=tf.truncated_normal_initializer(stddev=.1))\n", - " for dfn in used_dense_feature_names:\n", - " f_columns[dfn] = tf.feature_column.numeric_column(dfn)\n", - " return f_columns\n", - "\n", - " def create_tower(features, columns):\n", - " \"\"\"Creates the tower mapping features to embeddings.\n", - "\n", - " Args:\n", - " features: a dictionary of tensors of shape [batch_size], keyed by\n", - " feature name. Sparse features are associated to tensors of strings,\n", - " while dense features are associated to tensors of floats.\n", - " columns: a dictionary, keyed by feature names, of _DenseColumn and\n", - " _NumericColumn.\n", - "\n", - " Returns:\n", - " A pair of elements: hidden_layer and output_layer.\n", - " hidden_layer is a tensor of shape [batch_size, hidden_dims[-1]].\n", - " output_layer is a dictionary keyed by the output feature names, of\n", - " dictionaries {'labels': labels, 'logits': logits}.\n", - " Dense output features have both labels and logits as float tensors \n", - " of shape [batch_size, 1]. 
Sparse output features have labels as\n", - " string tensors of shape [batch_size, 1] and logits as float tensors\n", - " of shape [batch_size, len(sparse_feature_vocab)].\n", - " \"\"\"\n", - " # TODO: sanity check the arguments.\n", - " # Input features.\n", - " input_columns = [columns[fn] for fn in input_feature_names]\n", - " hidden_layer = tf.feature_column.input_layer(features, input_columns)\n", - " dense_input_feature_names = (\n", - " tuple(set(dense_feature_names).intersection(input_feature_names)))\n", - " input_dim = (\n", - " sum(sparse_input_feature_embedding_dims.values()) +\n", - " len(dense_input_feature_names))\n", - " for layer_idx, layer_output_dim in enumerate(hidden_dims):\n", - " w = tf.get_variable(\n", - " 'hidden%d_w_' % layer_idx,\n", - " shape=[input_dim, layer_output_dim],\n", - " initializer=tf.truncated_normal_initializer(\n", - " stddev=1.0 / np.sqrt(layer_output_dim)))\n", - " if inspect:\n", - " self._vars_to_inspect['hidden%d_w_' % layer_idx] = w\n", - " hidden_layer = tf.matmul(hidden_layer, w) # / 10.)\n", - " if inspect:\n", - " self._vars_to_inspect['hidden_layer_%d' % layer_idx] = hidden_layer\n", - " input_dim = layer_output_dim\n", - " # Output features.\n", - " output_layer = {}\n", - " for ofn in output_feature_names:\n", - " if ofn in sparse_feature_names:\n", - " feature_dim = len(sparse_feature_vocabs[ofn])\n", - " else:\n", - " feature_dim = 1\n", - " w = tf.get_variable(\n", - " 'output_w_%s' % ofn,\n", - " shape=[input_dim, feature_dim],\n", - " initializer=tf.truncated_normal_initializer(stddev=1.0 /\n", - " np.sqrt(feature_dim)))\n", - " if inspect:\n", - " self._vars_to_inspect['output_w_%s' % ofn] = w\n", - " if use_bias:\n", - " bias = tf.get_variable(\n", - " 'output_bias_%s' % ofn,\n", - " shape=[1, feature_dim],\n", - " initializer=tf.truncated_normal_initializer(stddev=1.0 /\n", - " np.sqrt(feature_dim)))\n", - " if inspect:\n", - " self._vars_to_inspect['output_bias_%s' % ofn] = bias\n", - " else:\n", - " bias = tf.constant(0.0, shape=[1, feature_dim])\n", - " output_layer[ofn] = {\n", - " 'labels':\n", - " features[ofn],\n", - " 'logits':\n", - " tf.add(tf.matmul(hidden_layer, w), bias) # w / 10.), bias)\n", - " }\n", - " if inspect:\n", - " self._vars_to_inspect['output_labels_%s' %\n", - " ofn] = output_layer[ofn]['labels']\n", - " self._vars_to_inspect['output_logits_%s' %\n", - " ofn] = output_layer[ofn]['logits']\n", - " return hidden_layer, output_layer\n", - "\n", - " def similarity_loss(top_embeddings, output_layer):\n", - " \"\"\"Build the loss to be optimized.\n", - "\n", - " Args:\n", - " top_embeddings: First element returned by create_tower.\n", - " output_layer: Second element returned by create_tower.\n", - "\n", - " Returns:\n", - " total_loss: A tensor of shape [1] with the total loss to be optimized.\n", - " losses: A dictionary keyed by output feature names, of tensors of shape\n", - " [1] with the contribution to the loss of each output feature.\n", - " \"\"\"\n", - " losses = {}\n", - " total_loss = tf.scalar_mul(l2_regularization,\n", - " tf.nn.l2_loss(top_embeddings))\n", - " for fn, output in six.iteritems(output_layer):\n", - " if fn in sparse_feature_names:\n", - " losses[fn] = tf.reduce_mean(\n", - " tf.nn.sparse_softmax_cross_entropy_with_logits(\n", - " logits=output['logits'],\n", - " labels=tf.feature_to_id(\n", - " output['labels'], keys_and_values=keys_and_values[fn])))\n", - " else:\n", - " losses[fn] = tf.sqrt(\n", - " tf.reduce_mean(\n", - " tf.square(output['logits'] -\n", - " 
tf.cast(output['labels'], tf.float32))))\n", - " total_loss += losses[fn]\n", - " return total_loss, losses\n", - "\n", - " # Body of the constructor.\n", - " input_feature_columns = generate_feature_columns(input_feature_names)\n", - " # Train\n", - " with tf.variable_scope('model', reuse=False):\n", - " train_hidden_layer, train_output_layer = create_tower(\n", - " make_batch(train_dataframe, batch_size), input_feature_columns)\n", - " self._train_loss, train_losses = similarity_loss(train_hidden_layer,\n", - " train_output_layer)\n", - " # Test\n", - " with tf.variable_scope('model', reuse=True):\n", - " test_hidden_layer, test_output_layer = create_tower(\n", - " make_batch(test_dataframe, batch_size), input_feature_columns)\n", - " test_loss, test_losses = similarity_loss(test_hidden_layer,\n", - " test_output_layer)\n", - " # Whole dataframe to get final embeddings\n", - " with tf.variable_scope('model', reuse=True):\n", - " self._hidden_layer, _ = create_tower(\n", - " make_batch(dataframe, dataframe.shape[0]), input_feature_columns)\n", - " # Metrics is a dictionary of dictionaries of dictionaries.\n", - " # The 3 levels are used as plots, line colors, and line styles respectively.\n", - " self._metrics = {\n", - " 'total': {\n", - " 'train': {'loss': self._train_loss},\n", - " 'test': {'loss': test_loss}\n", - " },\n", - " 'feature': {\n", - " 'train': {'%s loss' % k: v for k, v in six.iteritems(train_losses)},\n", - " 'test': {'%s loss' % k: v for k, v in six.iteritems(test_losses)}\n", - " }\n", - " }\n", - "\n", - " def train(self,\n", - " num_iterations=30,\n", - " learning_rate=1.0,\n", - " plot_results=True,\n", - " optimizer=tf.train.GradientDescentOptimizer):\n", - " \"\"\"Trains the model.\n", - "\n", - " Args:\n", - " num_iterations: int, the number of iterations to run.\n", - " learning_rate: float, the optimizer learning rate.\n", - " plot_results: bool, whether to plot the results at the end of training.\n", - " optimizer: tf.train.Optimizer, the optimizer to be used for training.\n", - " \"\"\"\n", - " with self._train_loss.graph.as_default():\n", - " opt = optimizer(learning_rate)\n", - " train_op = opt.minimize(self._train_loss)\n", - " opt_init_op = tf.variables_initializer(opt.variables())\n", - " if self._session is None:\n", - " self._session = tf.Session()\n", - " with self._session.as_default():\n", - " self._session.run(tf.global_variables_initializer())\n", - " self._session.run(tf.local_variables_initializer())\n", - " self._session.run(tf.tables_initializer())\n", - " tf.train.start_queue_runners()\n", - "\n", - " with self._session.as_default():\n", - " self._session.run(opt_init_op)\n", - " if plot_results: \n", - " iterations = []\n", - " metrics_vals = {k0: {k1: {k2: []\n", - " for k2 in v1}\n", - " for k1, v1 in six.iteritems(v0)}\n", - " for k0, v0 in six.iteritems(self._metrics)}\n", - "\n", - " # Train and append results.\n", - " for i in range(num_iterations + 1):\n", - " _, results = self._session.run((train_op, self._metrics))\n", - "\n", - " # Printing the 1 liner with losses.\n", - " if (i % 10 == 0) or i == num_iterations:\n", - " print('\\riteration%6d, ' % i + ', '.join(\n", - " ['%s %s %s: %7.3f' % (k0, k1, k2, v2)\n", - " for k0, v0 in six.iteritems(results)\n", - " for k1, v1 in six.iteritems(v0)\n", - " for k2, v2 in six.iteritems(v1)])\n", - " , end=\" \"\n", - " )\n", - " if plot_results:\n", - " iterations.append(i)\n", - " for k0, v0 in six.iteritems(results):\n", - " for k1, v1 in six.iteritems(v0):\n", - " for k2, v2 in 
six.iteritems(v1):\n", - " metrics_vals[k0][k1][k2].append(results[k0][k1][k2])\n", - "\n", - " # Feedforward the entire dataframe to get all the embeddings.\n", - " self._embeddings = self._session.run(self._hidden_layer)\n", - "\n", - " # Plot the losses and embeddings.\n", - " if plot_results:\n", - " num_subplots = len(metrics_vals) + 1\n", - " colors = 10 * ('red', 'blue', 'black', 'green')\n", - " styles = 10 * ('-', '--', '-.', ':')\n", - " # Plot the metrics.\n", - " fig = plt.figure()\n", - " fig.set_size_inches(num_subplots*10, 8)\n", - " for i0, (k0, v0) in enumerate(six.iteritems(metrics_vals)):\n", - " ax = fig.add_subplot(1, num_subplots, i0+1)\n", - " ax.set_title(k0)\n", - " for i1, (k1, v1) in enumerate(six.iteritems(v0)):\n", - " for i2, (k2, v2) in enumerate(six.iteritems(v1)):\n", - " ax.plot(iterations, v2, label='%s %s' % (k1, k2),\n", - " color=colors[i1], linestyle=styles[i2])\n", - " ax.set_xlim([1, num_iterations])\n", - " ax.set_yscale('log')\n", - " ax.legend()\n", - " # Plot the embeddings (first 3 dimensions).\n", - " ax.legend(loc='upper right')\n", - " ax = fig.add_subplot(1, num_subplots, num_subplots)\n", - " ax.scatter(\n", - " self._embeddings[:, 0], self._embeddings[:, 1],\n", - " alpha=0.5, marker='o')\n", - " ax.set_title('embeddings')\n", - "\n", - "\n", - " @property\n", - " def embeddings(self):\n", - " return self._embeddings" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "Anh93kGFUFEt" - }, - "source": [ - "The next cell trains the DNN. You can choose either a predictor DNN or an autoencoder DNN by specifying the parameter `output_feature_names` as follows:\n", - "\n", - "* If choosing a predictor DNN, specify one feature, for example, [`rating`].\n", - "* If choosing an autoencoder DNN, specify all features as follows: `['maker','maker_location','broad_origin','cocoa_percent','bean_type','rating']`.\n", - "\n", - "You do not need to change the other parameters, but if you're curious:\n", - "* `l2_regularization`: Controls the weight for L2 regularization.\n", - "* `hidden_dims`: Controls the dimensions of the hidden layers.\n", - "\n", - "Running the next cell generates the following plots:\n", - "\n", - "* '*total*': Total loss across all features.\n", - "* '*feature*': Loss for the specified output features.\n", - "* '*embeddings*': First two dimensions of the generated embeddings." - ] - }, - { - "cell_type": "code", - "metadata": { - "cellView": "form", - "colab_type": "code", - "id": "7vcluIucw0BG", - "colab": {} - }, - "source": [ - "#@title Training a DNN Similarity Model\n", - "\n", - "# Define some constants related to this dataset.\n", - "sparse_feature_names = ('maker', 'maker_location', 'broad_origin',\n", - " 'specific_origin', 'bean_type')\n", - "dense_feature_names = ('reference_number', 'review_date', 'cocoa_percent',\n", - " 'rating')\n", - "\n", - "# Set of features used as input to the similarity model.\n", - "input_feature_names = ('maker', 'maker_location', 'broad_origin',\n", - " 'cocoa_percent', 'bean_type','rating', )\n", - "# Set of features used as output to the similarity model.\n", - "output_feature_names = ['rating'] #@param\n", - "\n", - "# As a rule of thumb, a reasonable choice for the embedding dimension of a\n", - "# sparse feature column is the log2 of the cardinality of its vocabulary.\n", - "# sparse_input_feature_embedding_dims = { 'maker': 9, 'maker_location': 6, ... 
}\n", - "default_embedding_dims = {\n", - " sfn: int(round(math.log(choc_data[sfn].nunique()) / math.log(2)))\n", - " for sfn in set(sparse_feature_names).intersection(input_feature_names)\n", - "}\n", - "# Dictionary mapping each sparse input feature to the dimension of its embedding\n", - "# space.\n", - "sparse_input_feature_embedding_dims = default_embedding_dims # can be a param\n", - "\n", - "# Weight of the L2 regularization applied to the top embedding layer.\n", - "l2_regularization = 10 #@param\n", - "# List of dimensions of the hidden layers of the deep neural network.\n", - "hidden_dims = [20, 10] #@param\n", - "\n", - "print('------ build model')\n", - "with tf.Graph().as_default():\n", - " similarity_model = SimilarityModel(\n", - " choc_data,\n", - " input_feature_names=input_feature_names,\n", - " output_feature_names=output_feature_names,\n", - " dense_feature_names=dense_feature_names,\n", - " sparse_input_feature_embedding_dims=sparse_input_feature_embedding_dims,\n", - " hidden_dims=hidden_dims,\n", - " l2_regularization=l2_regularization,\n", - " batch_size=100,\n", - " use_bias=True,\n", - " inspect=True)\n", - "\n", - "print('------ train model')\n", - "similarity_model.train(\n", - " num_iterations=1000,\n", - " learning_rate=0.1,\n", - " optimizer=tf.train.AdagradOptimizer)\n", - "print('\\n')\n" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "ImOGD5GJ8Ia7" - }, - "source": [ - "# 4. Cluster Chocolate Dataset\n", - "We're ready to cluster the chocolates! Run the code to set up the k-means clustering functions. You do not need to understand the code.\n", - "\n", - "**Note**: If you're following the Clustering self study, then before running the rest of this Colab, read the sections on [k-means](https://developers.google.com/machine-learning/clustering/algorithm/run-algorithm) and [quality metrics](https://developers.google.com/machine-learning/clustering/interpret)." 
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "cellView": "form",
- "colab_type": "code",
- "id": "eExms-TP8Hn6",
- "colab": {}
- },
- "source": [
- "#@title Run cell to set up functions\n",
- "def dfSimilarity(df,centroids):\n",
- "  ### dfSimilarity = Calculate similarities for dataframe input\n",
- "  ### We need to calculate ||a-b||^2 = ||a||^2 + ||b||^2 - 2*a.b\n",
- "  ### Implement this with matrix operations\n",
- "  ### See the Appendix for further explanation\n",
- "  numPoints = len(df.index)\n",
- "  numCentroids = len(centroids.index)\n",
- "  ## Strictly speaking, we don't need to calculate the norm of points\n",
- "  # because it only adds a constant bias to the distances\n",
- "  # But we calculate it so that the similarity doesn't go negative\n",
- "  # and stays in [0,1], which aids debugging\n",
- "  pointNorms = np.square(nla.norm(df,axis=1))\n",
- "  pointNorms = np.reshape(pointNorms,[numPoints,1])\n",
- "  ## Calculate the norm of centroids\n",
- "  centroidNorms = np.square(nla.norm(centroids,axis=1))\n",
- "  centroidNorms = np.reshape(centroidNorms,(1,numCentroids))\n",
- "  ## Calculate ||a||^2 + ||b||^2 - 2*a.b for every point-centroid pair\n",
- "  similarities = pointNorms + centroidNorms - 2.0*np.dot(df,np.transpose(centroids))\n",
- "  # Divide by the number of embedding dimensions (10 here, since\n",
- "  # hidden_dims = [20, 10]) to keep the values in a convenient range\n",
- "  similarities = similarities/10.0\n",
- "  # Numerical artifacts can produce negligible but negative values that\n",
- "  # become NaN under the square root\n",
- "  similarities = similarities.clip(min=0.0)\n",
- "  # Take the square root since we computed ||a-b||^2\n",
- "  similarities = np.sqrt(similarities)\n",
- "  return similarities\n",
- "\n",
- "def initCentroids(df,k,feature_cols):\n",
- "  # Pick 'k' examples at random to serve as initial centroids\n",
- "  limit = len(df.index)\n",
- "  # randint's upper bound is exclusive, so pass 'limit' to allow every row\n",
- "  centroids_key = np.random.randint(0,limit,k)\n",
- "  centroids = df.loc[centroids_key,feature_cols].copy(deep=True)\n",
- "  # the indexes get copied over so reset them\n",
- "  centroids.reset_index(drop=True,inplace=True)\n",
- "  return centroids\n",
- "\n",
- "def pt2centroid(df,centroids,feature_cols):\n",
- "  ### Calculate similarities between all points and centroids\n",
- "  ### And assign points to the closest centroid + save that distance\n",
- "  numCentroids = len(centroids.index)\n",
- "  numExamples = len(df.index)\n",
- "  # dfSimilarity = Calculate similarities for dataframe input\n",
- "  dist = dfSimilarity(df.loc[:,feature_cols],centroids.loc[:,feature_cols])\n",
- "  df.loc[:,'centroid'] = np.argmin(dist,axis=1) # closest centroid\n",
- "  df.loc[:,'pt2centroid'] = np.min(dist,axis=1) # minimum distance\n",
- "  return df\n",
- "\n",
- "def recomputeCentroids(df,centroids,feature_cols):\n",
- "  ### For every centroid, recompute it as an average of the points\n",
- "  ### assigned to it\n",
- "  numCentroids = len(centroids.index)\n",
- "  for cen in range(numCentroids):\n",
- "    dfSubset = df.loc[df['centroid'] == cen, feature_cols] # all points for centroid\n",
- "    if not(dfSubset.empty): # if there are points assigned to the centroid\n",
- "      clusterAvg = np.sum(dfSubset)/len(dfSubset.index)\n",
- "      centroids.loc[cen] = clusterAvg\n",
- "  return centroids\n",
- "\n",
- "def kmeans(df,k,feature_cols,verbose):\n",
- "  flagConvergence = False\n",
- "  maxIter = 100\n",
- "  iter = 0 # ensure k-means doesn't run forever\n",
- "  centroids = initCentroids(df,k,feature_cols)\n",
- "  while not(flagConvergence):\n",
- "    iter += 1\n",
- "    #Save old mapping of 
points to centroids\n", - " oldMapping = df['centroid'].copy(deep=True)\n", - " # Perform k-means\n", - " df = pt2centroid(df,centroids,feature_cols)\n", - " centroids = recomputeCentroids(df,centroids,feature_cols)\n", - " # Check convergence by comparing [oldMapping, newMapping]\n", - " newMapping = df['centroid']\n", - " flagConvergence = all(oldMapping == newMapping)\n", - " if verbose == 1:\n", - " print(\"Total distance:\" + str(np.sum(df['pt2centroid'])))\n", - " if (iter > maxIter):\n", - " print('k-means did not converge! Reached maximum iteration limit of ' \\\n", - " + str(maxIter) + '.')\n", - " sys.exit()\n", - " return\n", - " print('k-means converged for ' + str(k) + ' clusters' + \\\n", - " ' after ' + str(iter) + ' iterations!')\n", - " return [df,centroids]" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "-KnRLWvw1rJ9" - }, - "source": [ - "Run the following cell to cluster the chocolate dataset, where `k` is the number of clusters. You'll experiment with different values of `k` later. For now, use `k = 160`.\n", - "\n", - "On every iteration of k-means, the output shows how the sum of distances from all examples to their centroids reduces, such that k-means always converges. The following table shows the data for the first few chocolates. On the extreme right of the table, check the assigned centroid for each example in the `centroid` column and the distance from the example to its centroid in the `pt2centroid` column.\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "cellView": "form", - "colab_type": "code", - "id": "AKDwhN9J1PhU", - "colab": {} - }, - "source": [ - "k = 160 #@param\n", - "\n", - "# Extract embeddings into a dataframe\n", - "choc_embed = similarity_model.embeddings\n", - "choc_embed = pd.DataFrame(choc_embed)\n", - "\n", - "feature_cols = choc_embed.columns.values # save original columns\n", - "# initialize every point to an impossible value, the k+1 cluster\n", - "choc_embed['centroid'] = k\n", - "# init the point to centroid distance to an impossible value \"2\" (>1)\n", - "choc_embed['pt2centroid'] = 2\n", - "[choc_embed,centroids] = kmeans(choc_embed,k,feature_cols,1)\n", - "print(\"Data for the first few chocolates, with 'centroid' and 'pt2centroid' on the extreme right:\")\n", - "choc_embed.head()" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "m6kE9uVnXjy4" - }, - "source": [ - "## Inspect Clustering Result" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "13TnsPz23xOU" - }, - "source": [ - "Inspect the chocolates in different clusters by changing the parameter `clusterNumber`\n", - "in the next cell and running the cell. Consider these questions as you inspect the clusters:\n", - "\n", - "* Are the clusters meaningful?\n", - "* Is the clustering result better with a manual similarity measure (see your previous Colab) or a supervised similarity measure?\n", - "* Does changing the number of clusters make the clusters more or less meaningful?\n", - "\n", - "For context, on the page [Supervised Similarity Measure](https://developers.google.com/machine-learning/clustering/similarity/supervised-similarity), read the table \"*Comparison of Manual and Supervised Measures*\". Then click the next cell for the discussion." 
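If you are curious how `dfSimilarity` above computes all point-to-centroid distances at once, the following sketch applies the same vectorized identity, ||a-b||^2 = ||a||^2 + ||b||^2 - 2*a.b, to random data. The function name `pairwise_sq_dist` is illustrative and not part of the Colab:

```python
import numpy as np

def pairwise_sq_dist(points, centroids):
    # ||a - b||^2 = ||a||^2 + ||b||^2 - 2 * a.b for every point/centroid pair.
    point_norms = np.square(points).sum(axis=1).reshape(-1, 1)        # shape [P, 1]
    centroid_norms = np.square(centroids).sum(axis=1).reshape(1, -1)  # shape [1, C]
    cross_terms = points @ centroids.T                                 # shape [P, C]
    # Clip tiny negative values caused by floating-point error.
    return (point_norms + centroid_norms - 2.0 * cross_terms).clip(min=0.0)

points = np.random.rand(5, 10)     # 5 embeddings with 10 dimensions
centroids = np.random.rand(3, 10)  # 3 centroids
print(np.sqrt(pairwise_sq_dist(points, centroids)))  # [5, 3] matrix of distances
```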
- ] - }, - { - "cell_type": "code", - "metadata": { - "cellView": "form", - "colab_type": "code", - "id": "NHWgGmpyux39", - "colab": {} - }, - "source": [ - "clusterNumber = 20 #@param\n", - "choc_data.loc[choc_embed['centroid']==clusterNumber,:]" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "MJtuP9w5jJHq" - }, - "source": [ - "### Solution: Discussion of clustering results\n", - "Click below for the answer." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "gxiPD8g_jShi" - }, - "source": [ - "**Discussion**:\n", - "\n", - "**Q. Are the clusters meaningful?** \n", - "\n", - "The clusters become more meaningful when you increase the number of clusters above approximately 100. Below ~100 clusters, dissimilar chocolates tend to be grouped together. Specifically, the grouping of numeric features is more meaningful than the categorical features. A possible cause is that the DNN isn't accurately encoding the categorical features because ~1800 examples isn't enough data to encode each of the dozens of values that categorical features have.\n", - "\n", - "**Q. Is the clustering result better with a manual similarity measure or a supervised similarity measure?**\n", - "\n", - "The clusters are more meaningful for the manual similarity measure because you customized the measure to accurately capture similarity between chocolates. Manual design was possible because the dataset was not complex. In comparison, in your supervised similarity measure, you just threw your data into the DNN and relied on the DNN to encode the similarity. The disadvantage is that with such a small dataset, the DNN lacks the data to accurately encode similarity.\n", - "\n", - "**Q. Does changing the number of clusters make the clusters more or less meaningful?**\n", - "\n", - "Increasing the number of clusters makes the clusters more meaningful up to a limit, because dissimilar chocolates can be broken up into distinct clusters." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "Z1eW0PlG57Zs" - }, - "source": [ - "# 5. Quality Metrics for Clusters\n", - "For the clusters, let's calculate the metrics discussed in [Interpret Results](https://developers.google.com/machine-learning/clustering/interpret). Read that course content before starting this code section.\n", - "\n", - "Run the next cell to set up functions." 
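For orientation, the two metrics that the next cell plots can also be read directly off the dataframe. This is only a sketch, assuming `choc_embed` still carries the `centroid` and `pt2centroid` columns written by `kmeans`, and that `pd` is the pandas import from the setup cell; the `summary` name is illustrative:

```python
cardinality = choc_embed.groupby('centroid').size()              # points per cluster
magnitude = choc_embed.groupby('centroid')['pt2centroid'].sum()  # total point-to-centroid distance
summary = pd.DataFrame({'cardinality': cardinality, 'magnitude': magnitude})
print(summary.head())
```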
- ] - }, - { - "cell_type": "code", - "metadata": { - "colab_type": "code", - "id": "i9Y2H-nR56C3", - "colab": {} - }, - "source": [ - "#@title Run cell to setup functions { display-mode: \"form\" }\n", - "def clusterCardinality(df):\n", - " k = np.max(df[\"centroid\"]) + 1\n", - " if six.PY2:\n", - " k = k.astype(int)\n", - " print(\"Number of clusters:\"+str(k))\n", - " clCard = np.zeros(k)\n", - " for kk in range(k):\n", - " clCard[kk] = np.sum(df[\"centroid\"]==kk)\n", - " if six.PY2:\n", - " clCard = clCard.astype(int)\n", - " # print \"Cluster Cardinality:\"+str(clCard)\n", - " plt.figure()\n", - " plt.bar(range(k),clCard)\n", - " plt.title('Cluster Cardinality')\n", - " plt.xlabel('Cluster Number: '+str(0)+' to '+str(k-1))\n", - " plt.ylabel('Points in Cluster')\n", - " return clCard\n", - "\n", - "def clusterMagnitude(df):\n", - " k = np.max(df[\"centroid\"]) + 1\n", - " if six.PY2:\n", - " k = k.astype(int)\n", - " cl = np.zeros(k)\n", - " clMag = np.zeros(k)\n", - " for kk in range(k):\n", - " idx = np.where(df[\"centroid\"]==kk)\n", - " idx = idx[0]\n", - " clMag[kk] = np.sum(df.loc[idx,\"pt2centroid\"])\n", - " # print \"Cluster Magnitude:\",clMag #precision set using np pref\n", - " plt.figure()\n", - " plt.bar(range(k),clMag)\n", - " plt.title('Cluster Magnitude')\n", - " plt.xlabel('Cluster Number: '+str(0)+' to '+str(k-1))\n", - " plt.ylabel('Total Point-to-Centroid Distance')\n", - " return clMag\n", - "\n", - "def plotCardVsMag(clCard,clMag):\n", - " plt.figure()\n", - " plt.scatter(clCard,clMag)\n", - " plt.xlim(xmin=0)\n", - " plt.ylim(ymin=0)\n", - " plt.title('Magnitude vs Cardinality')\n", - " plt.ylabel('Magnitude')\n", - " plt.xlabel('Cardinality')\n", - "\n", - "def clusterQualityMetrics(df):\n", - " clCard = clusterCardinality(df)\n", - " clMag = clusterMagnitude(df)\n", - " plotCardVsMag(clCard,clMag)" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "1nLYPlv4ejwD" - }, - "source": [ - "Calculate the following metrics by running the next cell:\n", - "\n", - "* cardinality of your clusters\n", - "* magnitude of your clusters\n", - "* cardinality vs magnitude\n", - "\n", - "Observe:\n", - "* The plots show that inspecting cluster metrics for many clusters isn't easy. However, the plots provide a general idea of the quality of the clustering. There are a number of outlying clusters.\n", - "* The correlation between cluster cardinality and cluster magnitude is lower than it was for a manual similarity measure. The lower correlation shows that some chocolates were harder to cluster, leading to large example-centroid distances.\n", - "\n", - "Experiment by changing these options and checking the result:\n", - "* dimensions of DNN's hidden layer\n", - "* autoencoder or predictor DNN\n", - "* number of clusters" - ] - }, - { - "cell_type": "code", - "metadata": { - "colab_type": "code", - "id": "3llKFtEpeiZ_", - "colab": {} - }, - "source": [ - "clusterQualityMetrics(choc_embed)" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "SBa0k9KK2PAt" - }, - "source": [ - "## Find Optimum Number of Clusters\n", - "\n", - "Let's try finding the right number of clusters as you did in the previous programming exercise. 
For details, read \"*Step Three: Optimum Number of Clusters*\" on [Interpret Results](https://developers.google.com/machine-learning/clustering/interpret).\n",
- "\n",
- "Run the code below (it takes a while!). The resulting plot is uneven for low `k`, showing that k-means has a difficult time clustering the data. As `k` increases past 100, the loss evens out, showing that k-means is effectively grouping the data into clusters."
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "cellView": "form",
- "colab_type": "code",
- "id": "-df7QnPlhuIN",
- "colab": {}
- },
- "source": [
- "# Plot loss vs number of clusters\n",
- "def lossVsClusters(kmin, kmax, kstep, choc_data):\n",
- "  kmax += 1 # include kmax itself, since range() excludes the endpoint\n",
- "  kRange = range(kmin, kmax, kstep)\n",
- "  loss = np.zeros(len(kRange))\n",
- "  lossCtr = 0\n",
- "  for kk in kRange:\n",
- "    [choc_data, centroids] = kmeans(choc_data, kk, feature_cols, 0)\n",
- "    loss[lossCtr] = np.sum(choc_data['pt2centroid'])\n",
- "    lossCtr += 1\n",
- "  plt.scatter(kRange, loss)\n",
- "  plt.title('Loss vs Clusters Used')\n",
- "  plt.xlabel('Number of clusters')\n",
- "  plt.ylabel('Total Point-to-Centroid Distance')\n",
- "\n",
- "\n",
- "kmin = 5 # @param\n",
- "kmax = 200 # @param\n",
- "kstep = 10 # @param\n",
- "lossVsClusters(kmin, kmax, kstep, choc_embed)"
- ],
- "execution_count": 0,
- "outputs": []
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "colab_type": "text",
- "id": "hK5iKbQ9k5EJ"
- },
- "source": [
- "# Summary\n",
- "\n",
- "This Colab demonstrates these characteristics of a supervised similarity measure, described on the page [Supervised Similarity Measure](https://developers.google.com/machine-learning/clustering/similarity/supervised-similarity) in the table \"*Comparison of Manual and Supervised Measures*\":\n",
- "\n",
- "* **Eliminates redundant information in correlated features**. As discussed in this [section](#scrollTo=MJtuP9w5jJHq), the DNN eliminates redundant information. However, to prove this characteristic, you'd need to train the DNN on adequate data and then compare the results with those of a manual similarity measure.\n",
- "* **Does not provide insight into calculated similarities**. Because you do not know what the embeddings represent, you have no insight into the clustering result.\n",
- "* **Suitable for large datasets with complex features**. Our dataset was too small to adequately train the DNN, demonstrating that DNNs need large datasets to train. The advantage is that you do not need to understand the input data. Because large datasets are hard to understand manually, these two characteristics go hand in hand.\n",
- "* **Not suitable for small datasets**. A small dataset does not have enough information to train the DNN."
- ]
- }
- ]
-}
\ No newline at end of file
+ "nbformat": 4,
+ "nbformat_minor": 0
+ }
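If you want a numerical companion to the loss-versus-k plot, the following sketch flags roughly where the curve flattens. It assumes you keep `kRange` and `loss` from `lossVsClusters` (for example, by returning them from the function); the 1% threshold is an arbitrary choice for this illustration and is not part of the Colab:

```python
import numpy as np

k_values = np.array(list(kRange))
rel_improvement = -np.diff(loss) / loss[:-1]    # fractional loss drop per step in k
flat = k_values[1:][rel_improvement < 0.01]     # k values where the curve has flattened
print('loss curve flattens around k =', flat[0] if flat.size else 'not reached')
```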