In [1]:
# Import all libraries

import pandas as pd
import numpy as np
from pathlib import Path
import os ,glob
import sqlite3
import folium
import gower

from scipy.cluster.hierarchy import linkage, fcluster, dendrogram
from sklearn.cluster import AgglomerativeClustering 

from folium.plugins import HeatMap
from folium.plugins import FastMarkerCluster
from folium.plugins import MarkerCluster
from folium.features import CustomIcon

from tqdm import tqdm
from collections import Counter
import plotly.express as px
import plotly.graph_objects as go

import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')


import random
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
from PIL import Image

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import clone
from sklearn.metrics import calinski_harabasz_score, silhouette_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, normalize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.base import clone
from sklearn.cluster import KMeans
from scipy.spatial.distance import euclidean, cityblock
from prince import MFA

from IPython.core.display import HTML
import pprint

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
sns.set_style("white")
In [2]:
pp = pprint.PrettyPrinter(indent=4, width=100)


HTML('''
<style>
.output_png {
    display: table-cell;
    text-align: center;
    vertical-align: middle;
}


</style>

<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>
''')
Out[2]:
In [3]:
def clean_data(df):
    """ Return cleaned data."""
    
    df1 = df.copy()
    
    
    # MULTI-CUISINE
    df1['count_cuisine'] = df1.cuisines.str.split(',').apply(lambda x: np.size(x))
    df1['multi_cuisine'] = df1['count_cuisine'].apply(lambda x: 1 if x>1 else 0)
    
    
    # df1['cuisines'] = df.cuisines.str.split(',', n=0, expand = True)
    #           .drop(columns=[1,2,3,4,5,6]))
    # df1.drop(columns='cusines', inplace=True)

    # add new column if a restaurant has a rating or not
    
    df1['has_rating'] = df1['user_rating.aggregate_rating'].apply(lambda x: 1 if x>1 else 0) 
    
    # one-hot encode cuisines
    space_strip = lambda x : 'cuisine_' + x.strip()
    df1.cuisines = df1.cuisines.apply(lambda x : 
                                      list(map(space_strip,
                                               list(set(x.split(','))))))
    df1 = df1.drop('cuisines', 1).join(df1.cuisines.str.join('|')
                                          .str.get_dummies())
    
    # one-hot encode highlights
    hlight_strip = (lambda x : 'highlight_' + x.strip().lstrip("'")
                                               .rstrip("'"))
    df1.highlights = df1.highlights.apply(lambda x : 
                                          list(map(hlight_strip, 
                                                   x[1:-1].split(","))))
    df1 = df1.drop('highlights', 1).join(df1.highlights.str.join('|')
                                            .str.get_dummies())
    
    # one-hot encode establishment
    est_strip = (lambda x : 'establishment_' + x.strip().lstrip("'")
                                                .rstrip("'"))
    df1.establishment = df1.establishment.apply(lambda x : 
                                                list(map(est_strip, 
                                                         x[1:-1].split(","))))
    df1 = df1.drop('establishment', 1).join(df1.establishment.str.join('|')
                                            .str.get_dummies())
    
    return df1



def plot_clusters(X, ys):
    """Plot clusters given the design matrix and cluster labels"""
    k_max = len(ys) + 1
    k_mid = k_max//2 + 2
    fig, ax = plt.subplots(2, k_max//2, dpi=150, sharex=True, sharey=True, 
                           figsize=(10,5))
    for k,y in zip(range(2, k_max+1), ys):
        if k < k_mid:
            ax[0][k%k_mid-2].scatter(*zip(*X), c=y, cmap = 'Set3',
                                     s=5, alpha=0.9)
            ax[0][k%k_mid-2].set_title('$k=%d$'%k)
            
        else:
            ax[1][k%k_mid].scatter(*zip(*X), c=y, cmap = 'Set3',
                                   s=5, alpha=0.9)
            ax[1][k%k_mid].set_title('$k=%d$'%k)

    return ax


def pooled_within_ssd(X, y, centroids, dist):
    """Compute pooled within-cluster sum of squares around the cluster mean
    
    Parameters
    ----------
    X : array
        Design matrix with each row corresponding to a point
    y : array
        Class label of each point
    centroids : array
        Number of pairs to sample
    dist : callable
        Distance between two points. It should accept two arrays, each 
        corresponding to the coordinates of each point
        
    Returns
    -------
    float
        Pooled within-cluster sum of squares around the cluster mean
    """

    return sum(dist(x_i, centroids[y_i])**2/(2*(y == y_i).sum())
               for x_i, y_i in zip(X, y))


def gap_statistic(X, y, centroids, dist, b, clusterer, random_state=None):
    """Compute the gap statistic
    
    Parameters
    ----------
    X : array
        Design matrix with each row corresponding to a point
    y : array
        Class label of each point
    centroids : array
        Number of pairs to sample
    dist : callable
        Distance between two points. It should accept two arrays, each 
        corresponding to the coordinates of each point
    b : int
        Number of realizations for the reference distribution
    clusterer : KMeans
        Clusterer object that will be used for clustering the reference 
        realizations
    random_state : int, default=None
        Determines random number generation for realizations
        
    Returns
    -------
    gs : float
        Gap statistic
    gs_std : float
        Standard deviation of gap statistic
    """
    
    rng = np.random.default_rng(random_state)
    minimum = X.min(0)
    maximum = X.max(0)

    log_wk = []
    for i in range(b):
        clusterer1 = clone(clusterer)
        clusterer1.set_params(n_clusters=len(centroids))
        # drawing from a uniform distribution
        X_ = rng.uniform(minimum, maximum, size=X.shape)
        # fitting the drawn values
        y_ = clusterer1.fit_predict(X_)
        log_wk.append(np.log(pooled_within_ssd(X_, y_, 
                                               clusterer1.cluster_centers_, 
                                               dist)))
    
    return [np.mean(log_wk - np.log(pooled_within_ssd(X, y, centroids, 
                                                      dist))),
            np.std(log_wk - np.log(pooled_within_ssd(X, y, centroids, dist)))]


def cluster_range(X, clusterer, k_start, k_stop, actual=None):
    """Perform k-Means clustering"""
    ys = []
    inertias = []
    chs = []
    scs = []
    gss = []
    gssds = []
    ps = []
    amis = []
    ars = []
    for k in range(k_start, k_stop+1):
        clusterer_k = clone(clusterer)
        clusterer_k.set_params(n_clusters=k)
        y = clusterer_k.fit_predict(X)
        ys.append(y)
        inertias.append(clusterer_k.inertia_)
        try:
            chs.append(calinski_harabasz_score(X, y))
        except:
            chs.append(np.nan)
        try:
            scs.append(silhouette_score(X, y))
        except:
            scs.append(np.nan)
        if actual is not None:
            ps.append(purity(actual, y))
            amis.append(adjusted_mutual_info_score(actual, y))
            ars.append(adjusted_rand_score(actual, y))
        gs = gap_statistic(X, y, clusterer_k.cluster_centers_, 
                                 euclidean, 5, 
                                 clone(clusterer).set_params(n_clusters=k), 
                                 random_state=1337)
        gss.append(gs[0])
        gssds.append(gs[1])
    
    return_dict = {'ys' : ys, 'inertias' : inertias, 'chs' : chs,
                   'gss' : gss, 'gssds' : gssds, 'scs' : scs}
    
    if actual is not None:
        return_dict['ps'] = ps
        return_dict['amis'] = amis
        return_dict['ars'] = ars
    return return_dict


def plot_internal(inertias, chs, scs, gss, gssds):
    """Plot internal validation values."""
    fig, ax = plt.subplots(4,1,dpi=100, figsize=(8,12))
    plt.subplots_adjust(hspace=.3)
    ks = np.arange(2, len(inertias)+2)
    ax[0].plot(ks, inertias, '-o', label='SSE', c='#E23744')
    ax[1].plot(ks, chs, '-o', label='CH', c='#E23744')
    ax[0].set_xlabel('$k$')
    ax[1].set_xlabel('$k$')
    ax[2].plot(ks, gss, '-o', label='Gap statistic', color = '#E23744')
    ax[3].plot(ks, scs, '-o', label='Silhouette coefficient', c='#E23744')
    ax[2].set_xlabel('$k$')
    ax[3].set_xlabel('$k$')
    ax[0].legend()
    ax[1].legend()
    ax[2].legend()
    ax[3].legend()
    return ax


def add_cluster(df, group, color):
    """Plot clusters in a map"""
    feature_group = folium.FeatureGroup(group)
    for row in tqdm(df.itertuples()):
        popup_str = f"""<b>{row.name}</b> <br>"""
        popup = folium.Tooltip(popup_str)
        feature_group.add_child(folium.Circle(location=[row.latitude,
                                                        row.longitude],
                                              radius=20, 
                                              color=color,
                                              fill= True,
                                              fill_color = color,
                                              fill_opacity=0.8,
                                              opacity=0.8,
                                              tooltip = popup))
    return feature_group

Clustering Restaurants in Makati

A data-driven approach to food tripping

EXECUTIVE SUMMARY

Implementing a clustering method of restaurants provides a way to create a data-driven approach of choosing where to eat. As such, this study sought to find out whether the said method will show significant results especially when done in the city of Makati. This study as well aimed to determine what the cluster’s features are and how it can help someone know where to satisfy their palettes. Data was gathered via the Zomato API, yielding 2375 data points pertaining to restaurants. Once mined and wrangled, dataset was reduced via multiple factor analysis. After which, the cleaned data was processed via $k$-means clustering. Internal validation methods were used to determine the most appropriate number of clusters with the dataset. After implementing the k-Means clustering method and internal validation checks, 4 clusters were chosen which were labeled as the following: namely, Quick Bites, Fast Food, Casual Dining and Full-Service Restaurants. The top features and top restaurants per cluster were taken to provide a brief idea of themes of each cluster. This can help several stakeholders in providing an unsupervised learning method as a way of making informed decisions on where to go to eat.

INTRODUCTION

Makati City is the hub for business and culture in the Philippines, and it houses the biggest financial district in the country. It has a population of 582,605, which booms to around 4.2M in the day. Recently though, it was notably identified as the gastronomic epicenter of the Philippines. Businesses and owners have taken advantage of this fact by building more restaurants and food places. As a result, various types of these food places have sprouted everywhere so much that it’s hard to determine which one is which. Because of this, people often have a hard time choosing where to fill their palettes. This study aims to eliminate that by having a data driven approach on choosing where to eat. Thus, we sought to answer the following question: What clusters would emerge based on the features of restaurants in Makati?

BUSINESS VALUE

In doing this study, 4 major stakeholders were considered so we could determine how beneficial this study is for their case, and these are the customers, restaurant owners, policymakers, and Zomato.

  • For Customers. Customers in Makati often have the problem of deciding where to eat because of the plethora of restaurants to choose from. Since this study clusters restaurants based on some latent features like price and ratings, this can at least help customers drill down their choices.
  • For Business Owners. Many factors come into play when deciding on where to put your first or next business venture, and this is especially true when it comes to opening restaurants. It’s not just location that has to be taken into account. Certain factors like competition in an area, density of restaurants in a location, and so on, can help decide if a given location is optimal. This study can aide the decision of business owners as they can give them information about a certain cluster. Also, for current restaurant owners, this can help them improve their services by looking at the latent features of the clusters and seeing which features affect either the ratings or customer feedback of their restaurant.
  • For Policymakers. When clusters are determined, policymakers can have a look at these and study the density and the competition in that area. This can lead to some changes in city-wide developments in terms of improving traffic, tax revenue collection, etc.
  • For Zomato. Zomato’s current recommendation framework works by suggesting the best restaurants based on the type of food that people prefer. If they can’t decide, they can simply type in the city and Zomato will give restaurant suggestions based on ratings. While it currently is doing the job for their platform, this study can give them an idea of how unsupervised learning can augment their current framework.
METHODOLOGY

To be able to identify clusters of restaurants in Makati city, a total of 2,375 restaurants were extracted using the Zomato API (https://developers.zomato.com/api). The general workflow for clustering the restaurants as shown in Figure 1 involves the following steps:

  1. Data Extraction
  2. Data Preprocessing
  3. Data Storage
  4. Exploratory Data Analysis
  5. Feature Extraction via One-Hot Encoding
  6. Dimensionality Reduction using Multiple Factor Analysis (MFA)
  7. Unsupervised clustering using $k$-Means algorithm

Each step of the workflow will be discussed in the succeeding sections.

Figure 1. General workflow for clustering Zomato restaurants in Makati City

1. Data Extraction

Restaurants in Makati were extracted using the Zomato API (https://developers.zomato.com/api) via web scraping tools, specifically Python’s requests module. The process involved extracting the restaurant features and reviews. Initial cleaning was also implemented to remove irrelevant columns. There are 3 csv files extracted from this step, zomato_restaurantcs.csv, zomato_reviews.csv, and zomato_address.csv.

The source code for scraping the restaurants is in Appendix A.

2. Data Preprocessing

Data preprocessing was implemented on the acquired restaurants data. The preprocessing involves:

  • Restaurants: removing restaurants that are out-of-bounds and have missing data (i.e. outside Makati city, missing location.latidue and location.longitdue, and missing establishment).
  • Address: using spatial join to acquire barangay per restaurant
In [4]:
df = pd.read_csv('zomato_restaurants.csv')

# Not in Makati City
df = df[(df['location.latitude'] != 0) & (df['location.longitude'] != 0)]
# empty data
df = df.set_index('id').drop([19234265, 19043191.0, 19307546.0, 6305044.0, 6308954.0,
                              6312056.0, 18153422.0, 19306478.0, 19312802.0,
                              19110184.0, 19110323.0]).reset_index()
In [5]:
df_barangay = pd.read_csv('zomato_address.csv')[['id', 'barangay']]
df_barangay = df_barangay[df_barangay['id'].isin(df['id'].to_list())]
In [6]:
df_reviews = pd.read_csv('zomato_reviews.csv')

3. Data Storage

The extracted datasets were correponding to 2,375 restaurants were stored in a local sqlite database. A total of 3 tables were created: restaurants, reviews, and address. Table 1 shows the data description of all data fields used in the study.

In [7]:
file = 'zomato.db'
conn = sqlite3.connect(file)
df.to_sql('restaurants', conn, if_exists='replace', index=False)

df_reviews.to_sql('reviews', conn, if_exists='replace', index=False)

df_barangay.to_sql('address', conn, if_exists='replace', index=False)

Table 1. Data Description

Data Field Description Table
ID Unique ID of the restaurant All
Name Name of the restaurant restaurants
Latitude Latitude value of the location of the restaurant restaurants
Longitude Longitude value of the location of the restaurant restaurants
Average Cost for Two Cost of food for two persons restaurants
Price Range Price group the restaurant belongs to\, 1 as the cheapest and 4 as the most expensive restaurants
Photos Count Count of food/restaurant photos shared by customers restaurants
Rating Over-all rating of the restaurant restaurants
Has Rating Whether restaurant is rated or not restaurants
Votes Count of ratings given restaurants
Cuisines List of cuisines offered restaurants
Cusine Count Total number of cuisines offered restaurants
Multi-cuisine Whether restaurant has offers more than one cuisine or not restaurants
Highlights List of features/amenities available restaurants
Has Table Reservation Whether restaurant allows table reservation or not restaurants
Has Table Booking Whether restaurant allows table booking or not restaurants
Reviews Count Count of reviews left by customers restaurants
Reviews List of 5 most recent reviews for the restaurant reviews
Barangay Barangay where the restaurant is located address

Table 2. First 5 rows of restaurants table

In [8]:
pd.read_sql('SELECT * from restaurants', conn).head(5)
Out[8]:
id name cuisines average_cost_for_two price_range highlights all_reviews_count photo_count is_table_reservation_supported has_table_booking establishment location.latitude location.longitude user_rating.aggregate_rating user_rating.votes
0 6301456.0 Central BBQ Boy Grill Filipino 800.0 3.0 ['Cash', 'Dinner', 'Restroom available', 'Nigh... 11.0 26.0 0.0 0.0 ['Bar'] 14.557535 121.005698 3.5 22.0
1 6301475.0 Claudette's American, Desserts 600.0 2.0 ['Lunch', 'Breakfast', 'Dinner', 'Credit Card'... 34.0 90.0 0.0 0.0 ['Casual Dining'] 14.559123 121.019683 3.7 73.0
2 6301802.0 Sumo Sam Japanese 1200.0 3.0 ['Lunch', 'Serves Alcohol', 'Credit Card', 'Di... 38.0 129.0 0.0 0.0 ['Casual Dining'] 14.565058 121.036706 3.0 89.0
3 6301810.0 Cafe France Coffee, Bakery 600.0 2.0 ['Credit Card', 'Lunch', 'No Alcohol Available... 18.0 66.0 0.0 0.0 ['Casual Dining'] 14.550726 121.025261 2.7 31.0
4 6301843.0 Modern China Restaurant Chinese 1200.0 3.0 ['Lunch', 'Credit Card', 'Dinner', 'Cash', 'Wi... 23.0 107.0 0.0 0.0 ['Casual Dining'] 14.550892 121.026684 3.9 58.0

Table 3. First 5 rows of reviews table

In [9]:
pd.read_sql('SELECT * from reviews', conn).head(5)
Out[9]:
id review
0 18972245.0 It's a authentic Taiwanese style steak!
1 18972245.0 Ubod ng kunat na steak makunat pa sa ninong mo...
2 18647867.0 Great beer selection! Nice place to hang out a...
3 18647867.0 Beers were of extensive selection. Buy 1 get 1...
4 6313503.0 Good service from the staff. Not sure if true ...

Table 4. First 5 rows of address table

In [10]:
pd.read_sql('SELECT * from address', conn).head(5)
Out[10]:
id barangay
0 6301456.0 San Isidro
1 6301475.0 Bel-Air
2 6301802.0 Guadalupe Viejo
3 6301810.0 San Lorenzo
4 6301843.0 San Lorenzo

4. Exploratory Data Analysis

The dataset extracted from Zomato is ample and with sufficient information about restaurants in Makati city. An important step for understanding what is possible to do with the dataset was to perform exploratory data analysis by understanding the different features.

Figure 2 shows the location of the different restaurants in Makati city. It can be seen how these restaurants tend to agglomerate in busy areas such as Poblacion, and the shopping malls in Ayala. Additionally, different areas are composed of restaurants with varying price range and user rating.

In [11]:
viz = pd.read_sql('SELECT * from restaurants', conn)
token = open("mapbox_token.txt").read() # you need your own token

fig = px.scatter_mapbox(viz,
                        lat="location.latitude",
                        lon="location.longitude",
                        size="average_cost_for_two",
                        color="user_rating.aggregate_rating",
                        color_continuous_scale=px.colors.sequential.thermal,
                        hover_name = 'name',
                        hover_data = {'average_cost_for_two': True,
                                      'user_rating.aggregate_rating': True,
                                      'photo_count': True,
                                      'establishment': True, 
                                      'highlights': False,
                                      'cuisines': True,
                                      'location.latitude':False,
                                      'location.longitude':False},
                       labels = {'user_rating.aggregate_rating': 'rating'})

fig.update_layout(
    mapbox = {
        'accesstoken': token,
        'style': "streets", 'zoom': 12},
    showlegend = False)

fig.show(renderer = 'notebook')

Figure 2. Restaurants in Makati City (size of bubble is based on average_price_for_two and color based on user_rating.aggregate_rating).

Figure 3 shows the varying cuisines that restaurants in Makati offer with Filipino as the top cuisine in terms of number of restaurants/food places. Additionally, the most popular establishments are Casual Dining, Quick Bites, and Beverages Shops in terms of number of restaurants/food places.

In [12]:
viz = pd.read_sql('SELECT * from restaurants', conn)
viz['cuisines'] = (viz.cuisines.str.split(',', n=0, expand=True)
                      .drop(columns=[1, 2, 3, 4, 5, 6]))
viz['establishment'] = viz['establishment'].astype(
    str).str.replace('\[|\]|\'', '')
viz['establishment'] = viz['establishment'].replace(
    to_replace='establishment_', value='', regex=True)


small = 10

plt.rc('font', size=small)          # controls default text sizes
plt.rc('axes', titlesize=small)     # fontsize of the axes title
plt.rc('axes', labelsize=small)     # fontsize of the axes title
plt.rc('xtick', labelsize=small)    # fontsize of the tick labels
plt.rc('ytick', labelsize=small)    # fontsize of the tick labels
plt.rc('legend', fontsize=small)    # legend fontsize
plt.rc('figure', titlesize=small)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(11, 5))

pop_cuisines = viz.groupby(['cuisines']).count(
)[['id']].sort_values(by='id', ascending=False)[:10]
                                                
pop_establishments = viz.groupby(['establishment']).count()[
    ['id']].sort_values(by='id', ascending=False)[:10]

sns.barplot(x='id', y=pop_cuisines.index,
            data=pop_cuisines,
            color='#E23744', ax=ax1)
ax1.set(xlabel='Number of Restaurants', ylabel='Primary Cuisine')

sns.barplot(x='id', y= pop_establishments.index, data=pop_establishments,
            color='#E23744', ax=ax2)
ax2.set(xlabel='Number of Restaurants', ylabel='Establishments')

fig.tight_layout(pad=3.0)

Figure 3. Top 10 Primary Cuisines and Establishment in terms of number of restaurants in Makati City

5. Feature Extraction

Several features that could improve the performance of the clustering method were processed and derived. Specifically, feature extraction was based on the original features - cuisine, highlight, establishment and user_rating.aggregate_rating.

One-hot encoded features. For the cuisine, highlight and establishment, the values under them are in list form. Hence, these features were one-hot encoded.

Derived features. Three columns were derived from existing features:

  • has_rating (boolean) - if a restaurant has ratings

  • count_cuisine (integer) - number of cuisines of a restaurant

  • multi_cuisine (boolean) - if a restaurant has multiple cuisines

In [13]:
df_clean = clean_data(pd.read_sql("""SELECT * FROM restaurants""", conn))
df_info = df_clean[['id', 'name', 'location.latitude', 'location.longitude']]
df_num = df_clean.drop(['id', 'name',
                        'location.latitude', 'location.longitude'],
                       axis=1)
df_num = df_num[['average_cost_for_two', 'all_reviews_count',
                 'photo_count', 'user_rating.aggregate_rating', 
                 'user_rating.votes', 'count_cuisine', 
                 'is_table_reservation_supported', 
                 'has_table_booking'] + df_num.columns[10:].tolist()]

# convert one-hot encoded features to string
df_num.iloc[:, 7:] = df_num.iloc[:, 7:].astype(int).astype(str)

6. Dimensionality Reduction

After performing the feature extraction, the data set had 2,375 rows and 165 columns. So, the dimensions were reduced to improve the accuracy and computational power of the clustering algorithm used.

Multiple factor analysis (MFA) was performed to reduce the number of columns that would be considered in the clustering process. Since MFA works well with grouped features of the same data type, this dimensionality reduction was performed on the one-hot encoded features of cuisine, highlight and establishment. Furthermore, grouping features with the same nature of values is a crucial step in performing MFA. Thus, we grouped these features based on cuisine, highlight and establishment. We chose the top three components which retained the highest variance explained. The selected components were able to retaine $94.54\%$ of the variance explained.

After reducing the dimension, the final design matrix that was used in the clustering process has 2,375 rows and 12 columns.

In [14]:
groups = {'cuisines' : df_num.columns[9:71].tolist(),
          'highlights' : df_num.columns[71:144].tolist(),
          'establishments' : df_num.columns[144:].tolist()}
In [15]:
mfa = MFA(groups=groups, n_components=3, n_iter=1000, random_state=42)

mfa.fit(df_num.iloc[:, 9:])
df_cat_mfa = mfa.transform(df_num.iloc[:, 9:])
In [16]:
df_cat_mfa.columns = ['comp_cuisine', 'comp_estab', 'comp_highlight']
explained_intertia = sum(mfa.explained_inertia_)

7. Unsupervised Clustering

To determine the clusters of Zomato-listed Makati restaurants formed based on the features included in this study, $k$-means clustering method was used. Before performing the clustering process, the values in the design matrix were normalized to even out the magnitude of the values in each column. In the clustering process, values from $k=2$ to $k=9$ clusters were considered. To provide a visual representation of the clustering for each value of $k$, a Principal Component Analysis (PCA) was performed. Then, for each value of $k$, the two principal components with the highest variance explained were plotted in a two-dimensional plane to have an overview of the clustering. For a quantitative measure of the appropriate number of clusters, an internal validation was performed. Specifically, the internal validation criteria used were (1) sum of squared distances to the centroids, (2) Calinski-Harabasz index, (3) Silhouette coefficient and (4) Gap statistic.

RESULTS

After performing $k$-means clustering with number of clusters $k=2$ to $k=9$, and plotting the two principal components with the highest variance explained under PCA in a two-dimensional plane, the graphs in Figure 4 were derived.

In [17]:
X = df_num.iloc[:, :9].merge(df_cat_mfa, left_index=True, right_index=True)
X.iloc[:,7:9] = X.iloc[:,7:9].astype(int)
In [18]:
X_norm = normalize(X)

In [19]:
X_new = PCA(n_components=2, random_state=1337).fit_transform(X_norm)
res = cluster_range(X_norm, KMeans(random_state=1337), 2, 9, actual=None)
In [20]:
plot_clusters(X_new, res['ys']);

Figure 4. Two-component (PCA) representation of $k$ clusters

Then, the results of the internal validation using the four criteria mentioned are shown in Figure 5.

In [21]:
plot_internal(res['inertias'], res['chs'], res['scs'],
              res['gss'], res['gssds']);

Figure 5. Internal validation results.

Looking at the internal validation results, the selected values of $k$ actually show relatively good results. Hence, for interpretability purpose, we chose $k=4$, that is, four clusters of Zomato-listed Makati restaurants. Basing on the SSE criterion, the ‘elbow’ can be seen at either $k=3$ or $k=4$. Although the highest values of the CH index and the Silhouette coefficient are not observed at $k=4$, the corresponding values at $k=4$ are somehow comparable being at the top three highest values under these indices. For the gap statistic criterion, although a good choice can be $k=5$, the gap statistic at $k=4$ is comparable.

The two-component (PCA) representation of this clustering is presented in Figure 6.

In [22]:
kmeans = KMeans(n_clusters=4, random_state=1337)
kmeans.fit_predict(X_norm);
kmeans_3 = KMeans(n_clusters=3, random_state=1337)
kmeans_3.fit_predict(X_norm);
kmeans_5 = KMeans(n_clusters=5, random_state=1337)
kmeans_5.fit_predict(X_norm);
In [23]:
cluster_color = {1:'#236E96',
                 2:'#F3872F', 
                 3:'#EC5578',
                 4:'#FFD700'}
sns.scatterplot(X_new[:,0], X_new[:,1],
                hue=(kmeans.labels_)+1, palette=cluster_color);

plt.xlabel('PCA 1');
plt.ylabel('PCA 2');

Figure 6. Two-component (PCA) representation of $4$-means clustering

In [24]:
# Save results
results = pd.Series(kmeans.labels_)
results_3 = pd.Series(kmeans_3.labels_)
results_5 = pd.Series(kmeans_5.labels_)
In [25]:
df_clean['label'] = results 
df_clean['label'] = (df_clean['label'] + 1).astype(str)
df_clean['label'] = df_clean['label'].replace({'1':'2', '2':'1',
                                               '4':'3', '3':'4'})

df_clean['label_3'] = results_3 
df_clean['label_3'] = (df_clean['label_3'] + 1).astype(str)

df_clean['label_5'] = results_5
df_clean['label_5'] = (df_clean['label_5'] + 1).astype(str)
INSIGHTS

The application of unsupervised clustering technique to the 2,375 Makati restaurants listed in Zomato resulted to 4 clusters with Cluster 2 dominating based on count. The distribution of the clusters based on count is shown in Figure 7.

In [26]:
tempdf = df_clean.groupby(['label']).size().reset_index().rename(columns = {0:'count'})

tempdf['total']= tempdf['count'].sum()
tempdf
tempdf['percent']= tempdf['count']/tempdf['total']*100
fig, ax = plt.subplots(figsize=(8, 5)) 

sns.barplot(x='label',y='percent',data=tempdf, color = '#E23744');
plt.ylabel('% of restaurants')
plt.xlabel('Cluster')

small = 13

plt.rc('font', size= small)          # controls default text sizes
plt.rc('axes', titlesize= small)     # fontsize of the axes title
plt.rc('axes', labelsize= small)     # fontsize of the axes title
plt.rc('xtick', labelsize= small)    # fontsize of the tick labels
plt.rc('ytick', labelsize= small)    # fontsize of the tick labels
plt.rc('legend', fontsize= small)    # legend fontsize
plt.rc('figure', titlesize= small) 

Figure 7. Proportion of Restaurants per Cluster

Restaurant Cluster ($k$ = 4)

Looking closely at the features of each cluster such as price, ratings, cuisines offered and amenities, we can label each cluster as:

  • Cluster 1: Quick Bites
  • Cluster 2: Fast Food
  • Cluster 3: Casual Dining
  • Cluster 4: Full-Servie Restaurants

Characteristics of each cluster is discussed in the succeeding sections. To get an idea of how restaurants are dispersed around Makati, refer to Figure 8.

In [27]:
viz = df_clean.copy()
viz = viz.rename(columns={'location.latitude':'latitude',
                          'location.longitude':'longitude'})

mapbox = open("mapbox_key.txt").read() # you need your own token

main = folium.Map([14.5547, 121.0244], zoom_start=14, tiles=None)

main.add_child(add_cluster(viz[viz['label'] == '1'],'Cluster 1', '#236E96'))
main.add_child(add_cluster(viz[viz['label'] == '2'],'Cluster 2', '#F3872F'))
main.add_child(add_cluster(viz[viz['label'] == '3'],'Cluster 3', '#EC5578'))
main.add_child(add_cluster(viz[viz['label'] == '4'],'Cluster 4', '#FFD700'))

folium.TileLayer(tiles=mapbox, attr='Mapbox attribution',
                 name='Cluster').add_to(main)
folium.LayerControl(collapsed=False).add_to(main)
main
589it [00:00, 10790.37it/s]
843it [00:00, 10903.37it/s]
708it [00:00, 10775.39it/s]
235it [00:00, 10673.45it/s]
Out[27]:
Make this Notebook Trusted to load map: File -> Trust Notebook

Figure 8. Location of restaurants color coded by their cluster

In [28]:
with open('misc/barangays-municity-ph137602000.0.1.json', 'r') as f:
    f = f.read()
data = json.loads(f)
data['features'] = [barangay for barangay in data['features'] 
                    if barangay['properties']['ADM4_EN'] not in
                    ['Guadalupe Viejo', 'Kasilawan', 'Rizal', 'Singkamas',
                    'Post Proper Northside', 'Post Proper Southside']]
In [29]:
barangay = pd.read_sql('SELECT * from address', conn)

viz = df_clean.copy()
viz = viz.merge(barangay, left_on = 'id', right_on = 'id')
viz = viz.groupby(['barangay', 'label']).size().unstack().replace(np.nan, 0)
viz['assigned_cluster'] = viz.idxmax(axis=1)
viz = viz.reset_index()[['barangay', 'assigned_cluster']]

viz_2 = (df.merge(barangay,
                  left_on='id',
                  right_on='id')[['id', 'location.latitude', 'location.longitude', 'barangay']]
           .merge(viz, left_on ='barangay', right_on = 'barangay')
           .rename(columns={'location.latitude':'latitude',
                          'location.longitude':'longitude'}))
In [30]:
style = {'1':{'fillColor': '#236E96', 'color': '#236E96'},
         '2':{'fillColor': '#F3872F', 'color': '#F3872F'},
         '3':{'fillColor': '#EC5578', 'color': '#EC5578'},
         '4':{'fillColor': '#FFD700', 'color': '#FFD700'}}

Drilling further on the clusters and location, we’ve identified the dominant cluster per barangay as plotted in Figure 9. Cluster 2 consistently covers majority of the barangays while there are no barangays that are dominated by Cluster 4.

In [31]:
lat = 14.5547
lon = 121.0444
main = folium.Map(
    location=[lat, lon],
    zoom_start=13, 
    tiles=None)

for barangay in data['features']:
    extra = data.copy()
    group = barangay['properties']['ADM4_EN']
    cluster = viz_2[viz_2['barangay']==group]['assigned_cluster'].iloc[0]
    barangay['properties']['cluster'] = f'<b> Cluster {cluster}  </b>'
    extra['features'] = [barangay]
    feature_group = folium.FeatureGroup(group)
    style_color = style[viz_2[viz_2['barangay']==group]['assigned_cluster'].iloc[0]],
    choropleth = folium.Choropleth(extra, key_on= group, fill_color= style_color[0]['fillColor']).add_to(main)
    choropleth.geojson.add_child(folium.features.GeoJsonTooltip(['ADM4_EN', 'cluster'], labels=False))

folium.TileLayer(tiles=mapbox, attr='Mapbox attribution',
                 name='Barangays').add_to(main)
folium.LayerControl(False).add_to(main)
main
Out[31]:
Make this Notebook Trusted to load map: File -> Trust Notebook

Figure 9. Dominant cluster per barangay

Cluster Analysis

In this subsection, we used the reviews of restaurants to be able to further understand the different clusters.

In [32]:
def grey_color_func(word, font_size, position, orientation, random_state=None,
                    **kwargs):
    return "hsl(0, 0%%, %d%%)" % random.randint(0, 60)


def plot_wc(df, cluster):
    masks = {
        '2': np.array(Image.open("./icons/chicken.png")),
        '1': np.array(Image.open("./icons/milk-tea.png")),
        '4': np.array(Image.open("./icons/serving-platter.png")),
        '3': np.array(Image.open("./icons/plate.png")),
    }
    wc = WordCloud(
        background_color="rgba(255, 255, 255, 0)",
        mode="RGB",  # RGBA for contours?
        stopwords=stopwords,
        width=500,
        height=500,
        mask=masks[cluster],
        font_path='Tryst-Regular.otf',
        color_func=grey_color_func,
    )
    corpus = df[df['label'] == cluster]['review'].to_list()
    wc.generate(' '.join(corpus))
    fig, ax = plt.subplots(1, 2, dpi=90, figsize=(10, 4))
    ax[0].imshow(wc, interpolation='bilinear')
    ax[0].axis('off')
    ax[1].set_title('Most common words in cluster')
    top_words = pd.DataFrame(reversed(list(
        wc.words_.items())[:20]),
        columns=['Top Words', 'Weight']
    ).set_index('Top Words').plot(kind='barh',
                                  color='#cb202d', width=.8, ax=ax[1])
    fig.tight_layout()

Cluster 1 - The Quick Bites

The cheapest cluster. It is dominated by on-the-go establishments – food cart, kiosks, bakeries, beverage shops and dessert parlors. Popular finger food establishments such as Potato Corner, Turks, Dunkin Donuts and Gong-cha can be found in this cluster. Since restaurants in this cluster mostly offers food that are quick to acquire, easy to eat, and frequently consumed, customers do not bother to leave comments resulting to low number of ratings and few reviews.

In [33]:
df_wc = pd.read_sql('SELECT * from reviews',
                    conn).merge(df_clean[['id', 'label']])
In [34]:
small = 8

plt.rc('font', size= small)          # controls default text sizes
plt.rc('axes', titlesize= small)     # fontsize of the axes title
plt.rc('axes', labelsize= small)     # fontsize of the axes title
plt.rc('xtick', labelsize= small)    # fontsize of the tick labels
plt.rc('ytick', labelsize= small)    # fontsize of the tick labels
plt.rc('legend', fontsize= small)    # legend fontsize
plt.rc('figure', titlesize= small)  # fontsize of the figure title
In [35]:
stopwords = set(STOPWORDS)
stopwords.update(["place", "food", "restaurant", "one", "will",
                  "Makati", "ha", "na", "branch", "good",
                  'really', 'itâ','ordered', 'order'])

plot_wc(df_wc, '1')

Figure 10. Word cloud for the Quick Bites cluster. From the top words, it can be inferred that this cluster are mostly composed of beverage shops, cafes, kiosks, and bakeries.

Cluster 2 - The Fast Food

Cluster 2 has the most number of restaurants among the 4 clusters. Fast food and coffee chains such as Jollibee, McDonalds, KFC, Starbucks, Coffee Bean and Tim Hortons makes up majority of the cluster. Quick service restaurants offer meals, but menus are quite limited. Usually, food is available as take-out though limited seats are provided as well.

In [36]:
plot_wc(df_wc, '2')

Figure 11. Word cloud for the Fast Food cluster. From the top words, it can be inferred that this cluster are mostly composed of cafes, and fast-foods.

Cluster 3 - The Casual Dining

Cluster 3 are casual food places good for the gathering of friends and family. Restaurants like Max, Shakey’s, Army Navy and Yellow Cab are among those who are members of this cluster. Casual dining restaurants are priced twice as fast-food, but they offer menus with a wider range of choices and cuisines. Additionally, most casual dining restaurants offer alcohol and accepts credit card. Ambiance is good in these restaurants which encouraged customers to share photos and leave reviews and ratings.

In [37]:
plot_wc(df_wc, '3')

Figure 12. Word cloud for the Fast Food cluster. From the top words, it can be inferred that this cluster are mostly composed of restaurants that offer comfort food such as chicken, pizza, and ramen.

Cluster 4 - The Full-Service Restaurants

High-end restaurants can be found in Cluster 4 with meals costing as much as Php11,000 for two. These are always sit-down restaurants with fancier menus and waiter service. Service and customer experience are equally important with food offerings to these types of establishments. Customers don’t just talk about the food, but the great service and the staff as well resulting in more attached customers. This is evidenced by the number of comments and pictures shared by the customers.

In [38]:
plot_wc(df_wc, '4')

Figure 13. Word cloud for the Fast Food cluster. From the top words, it can be inferred that this cluster are mostly composed of high-end restaurants.

Sensitivity Analysis

While it was concluded that grouping restaurants in Makati into $ k=4$ is the best choice, sensitivity analysis was performed to test the robustness of the results gathered.

Summarized below are the features of restaurant clusters for $k=3$ (Table 5) and $k=5$ (Table 6). It is noted that as the cluster count is decreased, the Quick Bites and Fast Food clusters combines into only one cluster with Casual and Full-Service clusters staying almost the same. On the other hand, clustering the restaurants into 5 resulted to splitting of the Casual and Full-Service clusters into 3. As such, it can be deduced that there are restaurants sharing common characteristics of Casual and Full-Service group, and there are subgroups of Full-Service restaurants.

Table 5. Summary of cluster attributes $(k=3)$

Feature Cluster 1 Cluster 2 Cluster 3
Restaurant Count 1,373 738 264
Average Cost for Two ₱ 341 ₱ 880 ₱ 1,831
Aggregate Rating 1.73/5 3.11/5 3.68/5
Average Count of Photos by Customers 21 113 285
Average Number Reviews 9 39 76
Table Reservation Availability Yes Yes Yes

Table 6. Summary of cluster attributes $(k=5)$

Feature Cluster 1 Cluster 2 Cluster 3 Cluster 4 Cluster 5
Restaurant Count 582 825 611 280 77
Average Cost for Two ₱ 254 ₱ 413 ₱ 864 ₱ 1,341 ₱ 2720
Aggregate Rating 1.05/5 2.28/5 2.95/5 3.82/5 3.5/5
Average Count of Photos by Customers 11 33 80 329 190
Average Number Reviews 5 13 27 97 37
Table Reservation Availability No Limited Limited Yes Yes
SUMMARY AND CONCLUSIONS

Using unsupervised clustering, we were able to find clusters of restaurants in Makati city using their feature. With $k$-means clustering, we obtained optimal value at $k = 4$ and determined the following clusters: Quick Bites, Fast Food, Casual Dining, and Full-Service. Overall, price and ratings are the features that clearly sets apart each of the four clusters. However, there are also a few differences based on other features and offerings such as cuisines offered and establishment type. A summary of the attributes of each cluster is presented in Table 7. The result of this study can be of help to customers when deciding where to eat, business owners when deciding location for business development, local government units, and Zomato.

Table 7. Summary of cluster attributes

Feature Cluster 1 Cluster 2 Cluster 3 Cluster 4
Restaurant Count 589 843 708 235
Average Cost for Two ₱ 253.16 ₱ 421.08 ₱ 917.66 ₱ 1,902.76
Aggregate Rating 1.05/5 2.3/5 3.14/5 3.64/5
Average Count of Photos by Customers 11 35 125 278
Average Number Reviews 5 14 41 74
Table Reservation Availability No Limited Yes Yes
Cuisines Offered Filipino, American, Chinese, Japanese, Korean Japanese, Filipino, Asian, Korean, Chinese Filipino, Japanese, American, Chinese, Korean European, Japanese, American, Asian, Italian
Establishment Types Bakery, Beverage Shop, Dessert Parlour, Kiosk Cafe, Food Court, Kiosk, Quick Bites Bar, Casual Dining, Lounge Fine Dining, Club, Wine Bar, Lounge
Top Highlights Cash, No Alcohol Available, Takeaway Available, Gcash Accepted, Table booking not available Cash, No Alcohol Available, Restroom Available, Wifi, Has power Outlets, Table booking not available Cash, Restroom Available, Table Booking Recommended, Credit Card, Beer, Wifi, Fullbar Table Booking Recommended, Fullbar, Credit Card, Wifi, Restroom Available, Beer
RECOMMENDATIONS

The following points can be considered in future research related to this work:

  1. Expand the covered location to include restaurants in Metro Manila. In addition, other variables may be included such as:
    • Performance of restaurants such as sales and transaction metrics
    • Distance of restaurants from amenities
    • Foot traffic
    • Market size and segment
  2. Perform comparative analysis of how the clusters were affected post-Covid pandemic.
  3. Perform supervised learning to be able to predict ratings of the restaurants and determine important features which can be of value for business owners.
ACKNOWLEDGEMENT
  • Prof. Christian M. Alis for the mentorship and valuable comments
  • Prof. Sebastian Ibañez and Prof. Gino Borja for their inputs
REFERENCES