# Import all libraries

import pandas as pd
import numpy as np
from pathlib import Path
import os ,glob
import sqlite3
import folium
import gower

from scipy.cluster.hierarchy import linkage, fcluster, dendrogram
from sklearn.cluster import AgglomerativeClustering 

from folium.plugins import HeatMap
from folium.plugins import FastMarkerCluster
from folium.plugins import MarkerCluster
from folium.features import CustomIcon

from tqdm import tqdm
from collections import Counter
import plotly.express as px
import plotly.graph_objects as go

import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')


import random
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
from PIL import Image

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import clone
from sklearn.metrics import calinski_harabasz_score, silhouette_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, normalize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.base import clone
from sklearn.cluster import KMeans
from scipy.spatial.distance import euclidean, cityblock
from prince import MFA

from IPython.core.display import HTML
import pprint

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
sns.set_style("white")

pp = pprint.PrettyPrinter(indent=4, width=100)


HTML('''
<style>
.output_png {
    display: table-cell;
    text-align: center;
    vertical-align: middle;
}


</style>

<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>
''')

def clean_data(df):
    """ Return cleaned data."""
    
    df1 = df.copy()
    
    
    # MULTI-CUISINE
    df1['count_cuisine'] = df1.cuisines.str.split(',').apply(lambda x: np.size(x))
    df1['multi_cuisine'] = df1['count_cuisine'].apply(lambda x: 1 if x>1 else 0)
    
    
    # df1['cuisines'] = df.cuisines.str.split(',', n=0, expand = True)
    #           .drop(columns=[1,2,3,4,5,6]))
    # df1.drop(columns='cusines', inplace=True)

    # add new column if a restaurant has a rating or not
    
    df1['has_rating'] = df1['user_rating.aggregate_rating'].apply(lambda x: 1 if x>1 else 0) 
    
    # one-hot encode cuisines
    space_strip = lambda x : 'cuisine_' + x.strip()
    df1.cuisines = df1.cuisines.apply(lambda x : 
                                      list(map(space_strip,
                                               list(set(x.split(','))))))
    df1 = df1.drop('cuisines', 1).join(df1.cuisines.str.join('|')
                                          .str.get_dummies())
    
    # one-hot encode highlights
    hlight_strip = (lambda x : 'highlight_' + x.strip().lstrip("'")
                                               .rstrip("'"))
    df1.highlights = df1.highlights.apply(lambda x : 
                                          list(map(hlight_strip, 
                                                   x[1:-1].split(","))))
    df1 = df1.drop('highlights', 1).join(df1.highlights.str.join('|')
                                            .str.get_dummies())
    
    # one-hot encode establishment
    est_strip = (lambda x : 'establishment_' + x.strip().lstrip("'")
                                                .rstrip("'"))
    df1.establishment = df1.establishment.apply(lambda x : 
                                                list(map(est_strip, 
                                                         x[1:-1].split(","))))
    df1 = df1.drop('establishment', 1).join(df1.establishment.str.join('|')
                                            .str.get_dummies())
    
    return df1



def plot_clusters(X, ys):
    """Plot clusters given the design matrix and cluster labels"""
    k_max = len(ys) + 1
    k_mid = k_max//2 + 2
    fig, ax = plt.subplots(2, k_max//2, dpi=150, sharex=True, sharey=True, 
                           figsize=(10,5))
    for k,y in zip(range(2, k_max+1), ys):
        if k < k_mid:
            ax[0][k%k_mid-2].scatter(*zip(*X), c=y, cmap = 'Set3',
                                     s=5, alpha=0.9)
            ax[0][k%k_mid-2].set_title('$k=%d$'%k)
            
        else:
            ax[1][k%k_mid].scatter(*zip(*X), c=y, cmap = 'Set3',
                                   s=5, alpha=0.9)
            ax[1][k%k_mid].set_title('$k=%d$'%k)

    return ax


def pooled_within_ssd(X, y, centroids, dist):
    """Compute pooled within-cluster sum of squares around the cluster mean
    
    Parameters
    ----------
    X : array
        Design matrix with each row corresponding to a point
    y : array
        Class label of each point
    centroids : array
        Number of pairs to sample
    dist : callable
        Distance between two points. It should accept two arrays, each 
        corresponding to the coordinates of each point
        
    Returns
    -------
    float
        Pooled within-cluster sum of squares around the cluster mean
    """

    return sum(dist(x_i, centroids[y_i])**2/(2*(y == y_i).sum())
               for x_i, y_i in zip(X, y))


def gap_statistic(X, y, centroids, dist, b, clusterer, random_state=None):
    """Compute the gap statistic
    
    Parameters
    ----------
    X : array
        Design matrix with each row corresponding to a point
    y : array
        Class label of each point
    centroids : array
        Number of pairs to sample
    dist : callable
        Distance between two points. It should accept two arrays, each 
        corresponding to the coordinates of each point
    b : int
        Number of realizations for the reference distribution
    clusterer : KMeans
        Clusterer object that will be used for clustering the reference 
        realizations
    random_state : int, default=None
        Determines random number generation for realizations
        
    Returns
    -------
    gs : float
        Gap statistic
    gs_std : float
        Standard deviation of gap statistic
    """
    
    rng = np.random.default_rng(random_state)
    minimum = X.min(0)
    maximum = X.max(0)

    log_wk = []
    for i in range(b):
        clusterer1 = clone(clusterer)
        clusterer1.set_params(n_clusters=len(centroids))
        # drawing from a uniform distribution
        X_ = rng.uniform(minimum, maximum, size=X.shape)
        # fitting the drawn values
        y_ = clusterer1.fit_predict(X_)
        log_wk.append(np.log(pooled_within_ssd(X_, y_, 
                                               clusterer1.cluster_centers_, 
                                               dist)))
    
    return [np.mean(log_wk - np.log(pooled_within_ssd(X, y, centroids, 
                                                      dist))),
            np.std(log_wk - np.log(pooled_within_ssd(X, y, centroids, dist)))]


def cluster_range(X, clusterer, k_start, k_stop, actual=None):
    """Perform k-Means clustering"""
    ys = []
    inertias = []
    chs = []
    scs = []
    gss = []
    gssds = []
    ps = []
    amis = []
    ars = []
    for k in range(k_start, k_stop+1):
        clusterer_k = clone(clusterer)
        clusterer_k.set_params(n_clusters=k)
        y = clusterer_k.fit_predict(X)
        ys.append(y)
        inertias.append(clusterer_k.inertia_)
        try:
            chs.append(calinski_harabasz_score(X, y))
        except:
            chs.append(np.nan)
        try:
            scs.append(silhouette_score(X, y))
        except:
            scs.append(np.nan)
        if actual is not None:
            ps.append(purity(actual, y))
            amis.append(adjusted_mutual_info_score(actual, y))
            ars.append(adjusted_rand_score(actual, y))
        gs = gap_statistic(X, y, clusterer_k.cluster_centers_, 
                                 euclidean, 5, 
                                 clone(clusterer).set_params(n_clusters=k), 
                                 random_state=1337)
        gss.append(gs[0])
        gssds.append(gs[1])
    
    return_dict = {'ys' : ys, 'inertias' : inertias, 'chs' : chs,
                   'gss' : gss, 'gssds' : gssds, 'scs' : scs}
    
    if actual is not None:
        return_dict['ps'] = ps
        return_dict['amis'] = amis
        return_dict['ars'] = ars
    return return_dict


def plot_internal(inertias, chs, scs, gss, gssds):
    """Plot internal validation values."""
    fig, ax = plt.subplots(4,1,dpi=100, figsize=(8,12))
    plt.subplots_adjust(hspace=.3)
    ks = np.arange(2, len(inertias)+2)
    ax[0].plot(ks, inertias, '-o', label='SSE', c='#E23744')
    ax[1].plot(ks, chs, '-o', label='CH', c='#E23744')
    ax[0].set_xlabel('$k$')
    ax[1].set_xlabel('$k$')
    ax[2].plot(ks, gss, '-o', label='Gap statistic', color = '#E23744')
    ax[3].plot(ks, scs, '-o', label='Silhouette coefficient', c='#E23744')
    ax[2].set_xlabel('$k$')
    ax[3].set_xlabel('$k$')
    ax[0].legend()
    ax[1].legend()
    ax[2].legend()
    ax[3].legend()
    return ax


def add_cluster(df, group, color):
    """Plot clusters in a map"""
    feature_group = folium.FeatureGroup(group)
    for row in tqdm(df.itertuples()):
        popup_str = f"""<b>{row.name}</b> <br>"""
        popup = folium.Tooltip(popup_str)
        feature_group.add_child(folium.Circle(location=[row.latitude,
                                                        row.longitude],
                                              radius=20, 
                                              color=color,
                                              fill= True,
                                              fill_color = color,
                                              fill_opacity=0.8,
                                              opacity=0.8,
                                              tooltip = popup))
    return feature_group

Clustering Restaurants in Makati

A data-driven approach to food tripping

EXECUTIVE SUMMARY

Implementing a clustering method of restaurants provides a way to create a data-driven approach of choosing where to eat. As such, this study sought to find out whether the said method will show significant results especially when done in the city of Makati. This study as well aimed to determine what the cluster’s features are and how it can help someone know where to satisfy their palettes. Data was gathered via the Zomato API, yielding 2375 data points pertaining to restaurants. Once mined and wrangled, dataset was reduced via multiple factor analysis. After which, the cleaned data was processed via $k$-means clustering. Internal validation methods were used to determine the most appropriate number of clusters with the dataset. After implementing the k-Means clustering method and internal validation checks, 4 clusters were chosen which were labeled as the following: namely, Quick Bites, Fast Food, Casual Dining and Full-Service Restaurants. The top features and top restaurants per cluster were taken to provide a brief idea of themes of each cluster. This can help several stakeholders in providing an unsupervised learning method as a way of making informed decisions on where to go to eat.

INTRODUCTION

Makati City is the hub for business and culture in the Philippines, and it houses the biggest financial district in the country. It has a population of 582,605, which booms to around 4.2M in the day. Recently though, it was notably identified as the gastronomic epicenter of the Philippines. Businesses and owners have taken advantage of this fact by building more restaurants and food places. As a result, various types of these food places have sprouted everywhere so much that it’s hard to determine which one is which. Because of this, people often have a hard time choosing where to fill their palettes. This study aims to eliminate that by having a data driven approach on choosing where to eat. Thus, we sought to answer the following question: What clusters would emerge based on the features of restaurants in Makati?

BUSINESS VALUE

In doing this study, 4 major stakeholders were considered so we could determine how beneficial this study is for their case, and these are the customers, restaurant owners, policymakers, and Zomato.

For Customers. Customers in Makati often have the problem of deciding where to eat because of the plethora of restaurants to choose from. Since this study clusters restaurants based on some latent features like price and ratings, this can at least help customers drill down their choices.

For Business Owners. Many factors come into play when deciding on where to put your first or next business venture, and this is especially true when it comes to opening restaurants. It’s not just location that has to be taken into account. Certain factors like competition in an area, density of restaurants in a location, and so on, can help decide if a given location is optimal. This study can aide the decision of business owners as they can give them information about a certain cluster. Also, for current restaurant owners, this can help them improve their services by looking at the latent features of the clusters and seeing which features affect either the ratings or customer feedback of their restaurant.

For Policymakers. When clusters are determined, policymakers can have a look at these and study the density and the competition in that area. This can lead to some changes in city-wide developments in terms of improving traffic, tax revenue collection, etc.

For Zomato. Zomato’s current recommendation framework works by suggesting the best restaurants based on the type of food that people prefer. If they can’t decide, they can simply type in the city and Zomato will give restaurant suggestions based on ratings. While it currently is doing the job for their platform, this study can give them an idea of how unsupervised learning can augment their current framework.

METHODOLOGY

To be able to identify clusters of restaurants in Makati city, a total of 2,375 restaurants were extracted using the Zomato API (https://developers.zomato.com/api). The general workflow for clustering the restaurants as shown in Figure 1 involves the following steps:

Data Extraction
Data Preprocessing
Data Storage
Exploratory Data Analysis
Feature Extraction via One-Hot Encoding
Dimensionality Reduction using Multiple Factor Analysis (MFA)
Unsupervised clustering using $k$-Means algorithm

Each step of the workflow will be discussed in the succeeding sections.

Figure 1. General workflow for clustering Zomato restaurants in Makati City¶

1. Data Extraction¶

Restaurants in Makati were extracted using the Zomato API (https://developers.zomato.com/api) via web scraping tools, specifically Python’s requests module. The process involved extracting the restaurant features and reviews. Initial cleaning was also implemented to remove irrelevant columns. There are 3 csv files extracted from this step, zomato_restaurantcs.csv, zomato_reviews.csv, and zomato_address.csv.

The source code for scraping the restaurants is in Appendix A.

2. Data Preprocessing¶

Data preprocessing was implemented on the acquired restaurants data. The preprocessing involves:

Restaurants: removing restaurants that are out-of-bounds and have missing data (i.e. outside Makati city, missing location.latidue and location.longitdue, and missing establishment).
Address: using spatial join to acquire barangay per restaurant

df = pd.read_csv('zomato_restaurants.csv')

# Not in Makati City
df = df[(df['location.latitude'] != 0) & (df['location.longitude'] != 0)]
# empty data
df = df.set_index('id').drop([19234265, 19043191.0, 19307546.0, 6305044.0, 6308954.0,
                              6312056.0, 18153422.0, 19306478.0, 19312802.0,
                              19110184.0, 19110323.0]).reset_index()

df_barangay = pd.read_csv('zomato_address.csv')[['id', 'barangay']]
df_barangay = df_barangay[df_barangay['id'].isin(df['id'].to_list())]

df_reviews = pd.read_csv('zomato_reviews.csv')

3. Data Storage¶

The extracted datasets were correponding to 2,375 restaurants were stored in a local sqlite database. A total of 3 tables were created: restaurants, reviews, and address. Table 1 shows the data description of all data fields used in the study.

file = 'zomato.db'
conn = sqlite3.connect(file)
df.to_sql('restaurants', conn, if_exists='replace', index=False)

df_reviews.to_sql('reviews', conn, if_exists='replace', index=False)

df_barangay.to_sql('address', conn, if_exists='replace', index=False)

Table 1. Data Description¶

Data Field	Description	Table
ID	Unique ID of the restaurant	All
Name	Name of the restaurant	restaurants
Latitude	Latitude value of the location of the restaurant	restaurants
Longitude	Longitude value of the location of the restaurant	restaurants
Average Cost for Two	Cost of food for two persons	restaurants
Price Range	Price group the restaurant belongs to\, 1 as the cheapest and 4 as the most expensive	restaurants
Photos Count	Count of food/restaurant photos shared by customers	restaurants
Rating	Over-all rating of the restaurant	restaurants
Has Rating	Whether restaurant is rated or not	restaurants
Votes	Count of ratings given	restaurants
Cuisines	List of cuisines offered	restaurants
Cusine Count	Total number of cuisines offered	restaurants
Multi-cuisine	Whether restaurant has offers more than one cuisine or not	restaurants
Highlights	List of features/amenities available	restaurants
Has Table Reservation	Whether restaurant allows table reservation or not	restaurants
Has Table Booking	Whether restaurant allows table booking or not	restaurants
Reviews Count	Count of reviews left by customers	restaurants
Reviews	List of 5 most recent reviews for the restaurant	reviews
Barangay	Barangay where the restaurant is located	address

Table 2. First 5 rows of `restaurants` table¶

pd.read_sql('SELECT * from restaurants', conn).head(5)

Table 3. First 5 rows of `reviews` table¶

pd.read_sql('SELECT * from reviews', conn).head(5)

Table 4. First 5 rows of `address` table¶

pd.read_sql('SELECT * from address', conn).head(5)

4. Exploratory Data Analysis¶

The dataset extracted from Zomato is ample and with sufficient information about restaurants in Makati city. An important step for understanding what is possible to do with the dataset was to perform exploratory data analysis by understanding the different features.

Figure 2 shows the location of the different restaurants in Makati city. It can be seen how these restaurants tend to agglomerate in busy areas such as Poblacion, and the shopping malls in Ayala. Additionally, different areas are composed of restaurants with varying price range and user rating.

viz = pd.read_sql('SELECT * from restaurants', conn)
token = open("mapbox_token.txt").read() # you need your own token

fig = px.scatter_mapbox(viz,
                        lat="location.latitude",
                        lon="location.longitude",
                        size="average_cost_for_two",
                        color="user_rating.aggregate_rating",
                        color_continuous_scale=px.colors.sequential.thermal,
                        hover_name = 'name',
                        hover_data = {'average_cost_for_two': True,
                                      'user_rating.aggregate_rating': True,
                                      'photo_count': True,
                                      'establishment': True, 
                                      'highlights': False,
                                      'cuisines': True,
                                      'location.latitude':False,
                                      'location.longitude':False},
                       labels = {'user_rating.aggregate_rating': 'rating'})

fig.update_layout(
    mapbox = {
        'accesstoken': token,
        'style': "streets", 'zoom': 12},
    showlegend = False)

fig.show(renderer = 'notebook')

Figure 2. Restaurants in Makati City (size of bubble is based on `average_price_for_two` and color based on `user_rating.aggregate_rating`).¶

Figure 3 shows the varying cuisines that restaurants in Makati offer with Filipino as the top cuisine in terms of number of restaurants/food places. Additionally, the most popular establishments are Casual Dining, Quick Bites, and Beverages Shops in terms of number of restaurants/food places.

viz = pd.read_sql('SELECT * from restaurants', conn)
viz['cuisines'] = (viz.cuisines.str.split(',', n=0, expand=True)
                      .drop(columns=[1, 2, 3, 4, 5, 6]))
viz['establishment'] = viz['establishment'].astype(
    str).str.replace('\[|\]|\'', '')
viz['establishment'] = viz['establishment'].replace(
    to_replace='establishment_', value='', regex=True)


small = 10

plt.rc('font', size=small)          # controls default text sizes
plt.rc('axes', titlesize=small)     # fontsize of the axes title
plt.rc('axes', labelsize=small)     # fontsize of the axes title
plt.rc('xtick', labelsize=small)    # fontsize of the tick labels
plt.rc('ytick', labelsize=small)    # fontsize of the tick labels
plt.rc('legend', fontsize=small)    # legend fontsize
plt.rc('figure', titlesize=small)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(11, 5))

pop_cuisines = viz.groupby(['cuisines']).count(
)[['id']].sort_values(by='id', ascending=False)[:10]
                                                
pop_establishments = viz.groupby(['establishment']).count()[
    ['id']].sort_values(by='id', ascending=False)[:10]

sns.barplot(x='id', y=pop_cuisines.index,
            data=pop_cuisines,
            color='#E23744', ax=ax1)
ax1.set(xlabel='Number of Restaurants', ylabel='Primary Cuisine')

sns.barplot(x='id', y= pop_establishments.index, data=pop_establishments,
            color='#E23744', ax=ax2)
ax2.set(xlabel='Number of Restaurants', ylabel='Establishments')

fig.tight_layout(pad=3.0)

Figure 3. Top 10 Primary Cuisines and Establishment in terms of number of restaurants in Makati City¶

5. Feature Extraction¶

Several features that could improve the performance of the clustering method were processed and derived. Specifically, feature extraction was based on the original features - cuisine, highlight, establishment and user_rating.aggregate_rating.

One-hot encoded features. For the cuisine, highlight and establishment, the values under them are in list form. Hence, these features were one-hot encoded.

Derived features. Three columns were derived from existing features:

has_rating (boolean) - if a restaurant has ratings
count_cuisine (integer) - number of cuisines of a restaurant
multi_cuisine (boolean) - if a restaurant has multiple cuisines

df_clean = clean_data(pd.read_sql("""SELECT * FROM restaurants""", conn))
df_info = df_clean[['id', 'name', 'location.latitude', 'location.longitude']]
df_num = df_clean.drop(['id', 'name',
                        'location.latitude', 'location.longitude'],
                       axis=1)
df_num = df_num[['average_cost_for_two', 'all_reviews_count',
                 'photo_count', 'user_rating.aggregate_rating', 
                 'user_rating.votes', 'count_cuisine', 
                 'is_table_reservation_supported', 
                 'has_table_booking'] + df_num.columns[10:].tolist()]

# convert one-hot encoded features to string
df_num.iloc[:, 7:] = df_num.iloc[:, 7:].astype(int).astype(str)

6. Dimensionality Reduction¶

After performing the feature extraction, the data set had 2,375 rows and 165 columns. So, the dimensions were reduced to improve the accuracy and computational power of the clustering algorithm used.

Multiple factor analysis (MFA) was performed to reduce the number of columns that would be considered in the clustering process. Since MFA works well with grouped features of the same data type, this dimensionality reduction was performed on the one-hot encoded features of cuisine, highlight and establishment. Furthermore, grouping features with the same nature of values is a crucial step in performing MFA. Thus, we grouped these features based on cuisine, highlight and establishment. We chose the top three components which retained the highest variance explained. The selected components were able to retaine $94.54\%$ of the variance explained.

After reducing the dimension, the final design matrix that was used in the clustering process has 2,375 rows and 12 columns.

groups = {'cuisines' : df_num.columns[9:71].tolist(),
          'highlights' : df_num.columns[71:144].tolist(),
          'establishments' : df_num.columns[144:].tolist()}

mfa = MFA(groups=groups, n_components=3, n_iter=1000, random_state=42)

mfa.fit(df_num.iloc[:, 9:])
df_cat_mfa = mfa.transform(df_num.iloc[:, 9:])

df_cat_mfa.columns = ['comp_cuisine', 'comp_estab', 'comp_highlight']
explained_intertia = sum(mfa.explained_inertia_)

7. Unsupervised Clustering¶

To determine the clusters of Zomato-listed Makati restaurants formed based on the features included in this study, $k$-means clustering method was used. Before performing the clustering process, the values in the design matrix were normalized to even out the magnitude of the values in each column. In the clustering process, values from $k=2$ to $k=9$ clusters were considered. To provide a visual representation of the clustering for each value of $k$, a Principal Component Analysis (PCA) was performed. Then, for each value of $k$, the two principal components with the highest variance explained were plotted in a two-dimensional plane to have an overview of the clustering. For a quantitative measure of the appropriate number of clusters, an internal validation was performed. Specifically, the internal validation criteria used were (1) sum of squared distances to the centroids, (2) Calinski-Harabasz index, (3) Silhouette coefficient and (4) Gap statistic.

RESULTS

After performing $k$-means clustering with number of clusters $k=2$ to $k=9$, and plotting the two principal components with the highest variance explained under PCA in a two-dimensional plane, the graphs in Figure 4 were derived.

X = df_num.iloc[:, :9].merge(df_cat_mfa, left_index=True, right_index=True)
X.iloc[:,7:9] = X.iloc[:,7:9].astype(int)

X_norm = normalize(X)

X_new = PCA(n_components=2, random_state=1337).fit_transform(X_norm)
res = cluster_range(X_norm, KMeans(random_state=1337), 2, 9, actual=None)

plot_clusters(X_new, res['ys']);

Figure 4. Two-component (PCA) representation of $k$ clusters¶

Then, the results of the internal validation using the four criteria mentioned are shown in Figure 5.

plot_internal(res['inertias'], res['chs'], res['scs'],
              res['gss'], res['gssds']);

Figure 5. Internal validation results.¶

Looking at the internal validation results, the selected values of $k$ actually show relatively good results. Hence, for interpretability purpose, we chose $k=4$, that is, four clusters of Zomato-listed Makati restaurants. Basing on the SSE criterion, the ‘elbow’ can be seen at either $k=3$ or $k=4$. Although the highest values of the CH index and the Silhouette coefficient are not observed at $k=4$, the corresponding values at $k=4$ are somehow comparable being at the top three highest values under these indices. For the gap statistic criterion, although a good choice can be $k=5$, the gap statistic at $k=4$ is comparable.

The two-component (PCA) representation of this clustering is presented in Figure 6.

kmeans = KMeans(n_clusters=4, random_state=1337)
kmeans.fit_predict(X_norm);
kmeans_3 = KMeans(n_clusters=3, random_state=1337)
kmeans_3.fit_predict(X_norm);
kmeans_5 = KMeans(n_clusters=5, random_state=1337)
kmeans_5.fit_predict(X_norm);

cluster_color = {1:'#236E96',
                 2:'#F3872F', 
                 3:'#EC5578',
                 4:'#FFD700'}
sns.scatterplot(X_new[:,0], X_new[:,1],
                hue=(kmeans.labels_)+1, palette=cluster_color);

plt.xlabel('PCA 1');
plt.ylabel('PCA 2');

Figure 6. Two-component (PCA) representation of $4$-means clustering¶

# Save results
results = pd.Series(kmeans.labels_)
results_3 = pd.Series(kmeans_3.labels_)
results_5 = pd.Series(kmeans_5.labels_)

df_clean['label'] = results 
df_clean['label'] = (df_clean['label'] + 1).astype(str)
df_clean['label'] = df_clean['label'].replace({'1':'2', '2':'1',
                                               '4':'3', '3':'4'})

df_clean['label_3'] = results_3 
df_clean['label_3'] = (df_clean['label_3'] + 1).astype(str)

df_clean['label_5'] = results_5
df_clean['label_5'] = (df_clean['label_5'] + 1).astype(str)

INSIGHTS

The application of unsupervised clustering technique to the 2,375 Makati restaurants listed in Zomato resulted to 4 clusters with Cluster 2 dominating based on count. The distribution of the clusters based on count is shown in Figure 7.

tempdf = df_clean.groupby(['label']).size().reset_index().rename(columns = {0:'count'})

tempdf['total']= tempdf['count'].sum()
tempdf
tempdf['percent']= tempdf['count']/tempdf['total']*100
fig, ax = plt.subplots(figsize=(8, 5)) 

sns.barplot(x='label',y='percent',data=tempdf, color = '#E23744');
plt.ylabel('% of restaurants')
plt.xlabel('Cluster')

small = 13

plt.rc('font', size= small)          # controls default text sizes
plt.rc('axes', titlesize= small)     # fontsize of the axes title
plt.rc('axes', labelsize= small)     # fontsize of the axes title
plt.rc('xtick', labelsize= small)    # fontsize of the tick labels
plt.rc('ytick', labelsize= small)    # fontsize of the tick labels
plt.rc('legend', fontsize= small)    # legend fontsize
plt.rc('figure', titlesize= small)

Figure 7. Proportion of Restaurants per Cluster¶

Restaurant Cluster ($k$ = 4)¶

Looking closely at the features of each cluster such as price, ratings, cuisines offered and amenities, we can label each cluster as:

Cluster 1: Quick Bites
Cluster 2: Fast Food
Cluster 3: Casual Dining
Cluster 4: Full-Servie Restaurants

Characteristics of each cluster is discussed in the succeeding sections. To get an idea of how restaurants are dispersed around Makati, refer to Figure 8.

viz = df_clean.copy()
viz = viz.rename(columns={'location.latitude':'latitude',
                          'location.longitude':'longitude'})

mapbox = open("mapbox_key.txt").read() # you need your own token

main = folium.Map([14.5547, 121.0244], zoom_start=14, tiles=None)

main.add_child(add_cluster(viz[viz['label'] == '1'],'Cluster 1', '#236E96'))
main.add_child(add_cluster(viz[viz['label'] == '2'],'Cluster 2', '#F3872F'))
main.add_child(add_cluster(viz[viz['label'] == '3'],'Cluster 3', '#EC5578'))
main.add_child(add_cluster(viz[viz['label'] == '4'],'Cluster 4', '#FFD700'))

folium.TileLayer(tiles=mapbox, attr='Mapbox attribution',
                 name='Cluster').add_to(main)
folium.LayerControl(collapsed=False).add_to(main)
main

589it [00:00, 10790.37it/s]
843it [00:00, 10903.37it/s]
708it [00:00, 10775.39it/s]
235it [00:00, 10673.45it/s]

Figure 8. Location of restaurants color coded by their cluster¶

with open('misc/barangays-municity-ph137602000.0.1.json', 'r') as f:
    f = f.read()
data = json.loads(f)
data['features'] = [barangay for barangay in data['features'] 
                    if barangay['properties']['ADM4_EN'] not in
                    ['Guadalupe Viejo', 'Kasilawan', 'Rizal', 'Singkamas',
                    'Post Proper Northside', 'Post Proper Southside']]

barangay = pd.read_sql('SELECT * from address', conn)

viz = df_clean.copy()
viz = viz.merge(barangay, left_on = 'id', right_on = 'id')
viz = viz.groupby(['barangay', 'label']).size().unstack().replace(np.nan, 0)
viz['assigned_cluster'] = viz.idxmax(axis=1)
viz = viz.reset_index()[['barangay', 'assigned_cluster']]

viz_2 = (df.merge(barangay,
                  left_on='id',
                  right_on='id')[['id', 'location.latitude', 'location.longitude', 'barangay']]
           .merge(viz, left_on ='barangay', right_on = 'barangay')
           .rename(columns={'location.latitude':'latitude',
                          'location.longitude':'longitude'}))

style = {'1':{'fillColor': '#236E96', 'color': '#236E96'},
         '2':{'fillColor': '#F3872F', 'color': '#F3872F'},
         '3':{'fillColor': '#EC5578', 'color': '#EC5578'},
         '4':{'fillColor': '#FFD700', 'color': '#FFD700'}}

Drilling further on the clusters and location, we’ve identified the dominant cluster per barangay as plotted in Figure 9. Cluster 2 consistently covers majority of the barangays while there are no barangays that are dominated by Cluster 4.

lat = 14.5547
lon = 121.0444
main = folium.Map(
    location=[lat, lon],
    zoom_start=13, 
    tiles=None)

for barangay in data['features']:
    extra = data.copy()
    group = barangay['properties']['ADM4_EN']
    cluster = viz_2[viz_2['barangay']==group]['assigned_cluster'].iloc[0]
    barangay['properties']['cluster'] = f'<b> Cluster {cluster}  </b>'
    extra['features'] = [barangay]
    feature_group = folium.FeatureGroup(group)
    style_color = style[viz_2[viz_2['barangay']==group]['assigned_cluster'].iloc[0]],
    choropleth = folium.Choropleth(extra, key_on= group, fill_color= style_color[0]['fillColor']).add_to(main)
    choropleth.geojson.add_child(folium.features.GeoJsonTooltip(['ADM4_EN', 'cluster'], labels=False))

folium.TileLayer(tiles=mapbox, attr='Mapbox attribution',
                 name='Barangays').add_to(main)
folium.LayerControl(False).add_to(main)
main

Figure 9. Dominant cluster per barangay¶

Cluster Analysis¶

In this subsection, we used the reviews of restaurants to be able to further understand the different clusters.

def grey_color_func(word, font_size, position, orientation, random_state=None,
                    **kwargs):
    return "hsl(0, 0%%, %d%%)" % random.randint(0, 60)


def plot_wc(df, cluster):
    masks = {
        '2': np.array(Image.open("./icons/chicken.png")),
        '1': np.array(Image.open("./icons/milk-tea.png")),
        '4': np.array(Image.open("./icons/serving-platter.png")),
        '3': np.array(Image.open("./icons/plate.png")),
    }
    wc = WordCloud(
        background_color="rgba(255, 255, 255, 0)",
        mode="RGB",  # RGBA for contours?
        stopwords=stopwords,
        width=500,
        height=500,
        mask=masks[cluster],
        font_path='Tryst-Regular.otf',
        color_func=grey_color_func,
    )
    corpus = df[df['label'] == cluster]['review'].to_list()
    wc.generate(' '.join(corpus))
    fig, ax = plt.subplots(1, 2, dpi=90, figsize=(10, 4))
    ax[0].imshow(wc, interpolation='bilinear')
    ax[0].axis('off')
    ax[1].set_title('Most common words in cluster')
    top_words = pd.DataFrame(reversed(list(
        wc.words_.items())[:20]),
        columns=['Top Words', 'Weight']
    ).set_index('Top Words').plot(kind='barh',
                                  color='#cb202d', width=.8, ax=ax[1])
    fig.tight_layout()

Cluster 1 - The Quick Bites¶

The cheapest cluster. It is dominated by on-the-go establishments – food cart, kiosks, bakeries, beverage shops and dessert parlors. Popular finger food establishments such as Potato Corner, Turks, Dunkin Donuts and Gong-cha can be found in this cluster. Since restaurants in this cluster mostly offers food that are quick to acquire, easy to eat, and frequently consumed, customers do not bother to leave comments resulting to low number of ratings and few reviews.

df_wc = pd.read_sql('SELECT * from reviews',
                    conn).merge(df_clean[['id', 'label']])

small = 8

plt.rc('font', size= small)          # controls default text sizes
plt.rc('axes', titlesize= small)     # fontsize of the axes title
plt.rc('axes', labelsize= small)     # fontsize of the axes title
plt.rc('xtick', labelsize= small)    # fontsize of the tick labels
plt.rc('ytick', labelsize= small)    # fontsize of the tick labels
plt.rc('legend', fontsize= small)    # legend fontsize
plt.rc('figure', titlesize= small)  # fontsize of the figure title

stopwords = set(STOPWORDS)
stopwords.update(["place", "food", "restaurant", "one", "will",
                  "Makati", "ha", "na", "branch", "good",
                  'really', 'itâ','ordered', 'order'])

plot_wc(df_wc, '1')

Figure 10. Word cloud for the Quick Bites cluster. From the top words, it can be inferred that this cluster are mostly composed of beverage shops, cafes, kiosks, and bakeries.¶

Cluster 2 - The Fast Food¶

Cluster 2 has the most number of restaurants among the 4 clusters. Fast food and coffee chains such as Jollibee, McDonalds, KFC, Starbucks, Coffee Bean and Tim Hortons makes up majority of the cluster. Quick service restaurants offer meals, but menus are quite limited. Usually, food is available as take-out though limited seats are provided as well.

plot_wc(df_wc, '2')

Figure 11. Word cloud for the Fast Food cluster. From the top words, it can be inferred that this cluster are mostly composed of cafes, and fast-foods.¶

Cluster 3 - The Casual Dining¶

Cluster 3 are casual food places good for the gathering of friends and family. Restaurants like Max, Shakey’s, Army Navy and Yellow Cab are among those who are members of this cluster. Casual dining restaurants are priced twice as fast-food, but they offer menus with a wider range of choices and cuisines. Additionally, most casual dining restaurants offer alcohol and accepts credit card. Ambiance is good in these restaurants which encouraged customers to share photos and leave reviews and ratings.

plot_wc(df_wc, '3')

Figure 12. Word cloud for the Fast Food cluster. From the top words, it can be inferred that this cluster are mostly composed of restaurants that offer comfort food such as chicken, pizza, and ramen.¶

Cluster 4 - The Full-Service Restaurants¶

High-end restaurants can be found in Cluster 4 with meals costing as much as Php11,000 for two. These are always sit-down restaurants with fancier menus and waiter service. Service and customer experience are equally important with food offerings to these types of establishments. Customers don’t just talk about the food, but the great service and the staff as well resulting in more attached customers. This is evidenced by the number of comments and pictures shared by the customers.

plot_wc(df_wc, '4')

Figure 13. Word cloud for the Fast Food cluster. From the top words, it can be inferred that this cluster are mostly composed of high-end restaurants.¶

Sensitivity Analysis¶

While it was concluded that grouping restaurants in Makati into $ k=4$ is the best choice, sensitivity analysis was performed to test the robustness of the results gathered.

Summarized below are the features of restaurant clusters for $k=3$ (Table 5) and $k=5$ (Table 6). It is noted that as the cluster count is decreased, the Quick Bites and Fast Food clusters combines into only one cluster with Casual and Full-Service clusters staying almost the same. On the other hand, clustering the restaurants into 5 resulted to splitting of the Casual and Full-Service clusters into 3. As such, it can be deduced that there are restaurants sharing common characteristics of Casual and Full-Service group, and there are subgroups of Full-Service restaurants.

Table 5. Summary of cluster attributes $(k=3)$¶

Feature	Cluster 1	Cluster 2	Cluster 3
Restaurant Count	1,373	738	264
Average Cost for Two	₱ 341	₱ 880	₱ 1,831
Aggregate Rating	1.73/5	3.11/5	3.68/5
Average Count of Photos by Customers	21	113	285
Average Number Reviews	9	39	76
Table Reservation Availability	Yes	Yes	Yes

Table 6. Summary of cluster attributes $(k=5)$¶

Feature	Cluster 1	Cluster 2	Cluster 3	Cluster 4	Cluster 5
Restaurant Count	582	825	611	280	77
Average Cost for Two	₱ 254	₱ 413	₱ 864	₱ 1,341	₱ 2720
Aggregate Rating	1.05/5	2.28/5	2.95/5	3.82/5	3.5/5
Average Count of Photos by Customers	11	33	80	329	190
Average Number Reviews	5	13	27	97	37
Table Reservation Availability	No	Limited	Limited	Yes	Yes

SUMMARY AND CONCLUSIONS

Using unsupervised clustering, we were able to find clusters of restaurants in Makati city using their feature. With $k$-means clustering, we obtained optimal value at $k = 4$ and determined the following clusters: Quick Bites, Fast Food, Casual Dining, and Full-Service. Overall, price and ratings are the features that clearly sets apart each of the four clusters. However, there are also a few differences based on other features and offerings such as cuisines offered and establishment type. A summary of the attributes of each cluster is presented in Table 7. The result of this study can be of help to customers when deciding where to eat, business owners when deciding location for business development, local government units, and Zomato.

Table 7. Summary of cluster attributes¶

Feature	Cluster 1	Cluster 2	Cluster 3	Cluster 4
Restaurant Count	589	843	708	235
Average Cost for Two	₱ 253.16	₱ 421.08	₱ 917.66	₱ 1,902.76
Aggregate Rating	1.05/5	2.3/5	3.14/5	3.64/5
Average Count of Photos by Customers	11	35	125	278
Average Number Reviews	5	14	41	74
Table Reservation Availability	No	Limited	Yes	Yes
Cuisines Offered	Filipino, American, Chinese, Japanese, Korean	Japanese, Filipino, Asian, Korean, Chinese	Filipino, Japanese, American, Chinese, Korean	European, Japanese, American, Asian, Italian
Establishment Types	Bakery, Beverage Shop, Dessert Parlour, Kiosk	Cafe, Food Court, Kiosk, Quick Bites	Bar, Casual Dining, Lounge	Fine Dining, Club, Wine Bar, Lounge
Top Highlights	Cash, No Alcohol Available, Takeaway Available, Gcash Accepted, Table booking not available	Cash, No Alcohol Available, Restroom Available, Wifi, Has power Outlets, Table booking not available	Cash, Restroom Available, Table Booking Recommended, Credit Card, Beer, Wifi, Fullbar	Table Booking Recommended, Fullbar, Credit Card, Wifi, Restroom Available, Beer

RECOMMENDATIONS

The following points can be considered in future research related to this work:

Expand the covered location to include restaurants in Metro Manila. In addition, other variables may be included such as:
- Performance of restaurants such as sales and transaction metrics
- Distance of restaurants from amenities
- Foot traffic
- Market size and segment
Perform comparative analysis of how the clusters were affected post-Covid pandemic.
Perform supervised learning to be able to predict ratings of the restaurants and determine important features which can be of value for business owners.

ACKNOWLEDGEMENT

Prof. Christian M. Alis for the mentorship and valuable comments
Prof. Sebastian Ibañez and Prof. Gino Borja for their inputs

REFERENCES

Abdi, H. and Valentin, D. (2007). Multiple Factor Analysis (MFA). Retrieved from: https://personal.utdallas.edu/~herve/Abdi-MFA2007-pretty.pdf
Aggarwal, C. (2015). Data Mining: The Textbook. New York: Springer Publishers
Faeldon, J. (2020). Philippines Administrative Boundaries JSON Maps. Github repository: https://github.com/faeldon/philippines-json-maps/
PhilGIS. (2020). Country Barangays File. Retrieved from: http://philgis.org/country-barangay/country-barangays-file

	id	name	cuisines	average_cost_for_two	price_range	highlights	all_reviews_count	photo_count	establishment	location.latitude	location.longitude	user_rating.aggregate_rating	user_rating.votes
0	6301456.0	Central BBQ Boy Grill	Filipino	800.0	3.0	['Cash', 'Dinner', 'Restroom available', 'Nigh...	11.0	26.0	['Bar']	14.557535	121.005698	3.5	22.0
1	6301475.0	Claudette's	American, Desserts	600.0	2.0	['Lunch', 'Breakfast', 'Dinner', 'Credit Card'...	34.0	90.0	['Casual Dining']	14.559123	121.019683	3.7	73.0
2	6301802.0	Sumo Sam	Japanese	1200.0	3.0	['Lunch', 'Serves Alcohol', 'Credit Card', 'Di...	38.0	129.0	['Casual Dining']	14.565058	121.036706	3.0	89.0
3	6301810.0	Cafe France	Coffee, Bakery	600.0	2.0	['Credit Card', 'Lunch', 'No Alcohol Available...	18.0	66.0	['Casual Dining']	14.550726	121.025261	2.7	31.0
4	6301843.0	Modern China Restaurant	Chinese	1200.0	3.0	['Lunch', 'Credit Card', 'Dinner', 'Cash', 'Wi...	23.0	107.0	['Casual Dining']	14.550892	121.026684	3.9	58.0

	id	review
0	18972245.0	It's a authentic Taiwanese style steak!
1	18972245.0	Ubod ng kunat na steak makunat pa sa ninong mo...
2	18647867.0	Great beer selection! Nice place to hang out a...
3	18647867.0	Beers were of extensive selection. Buy 1 get 1...
4	6313503.0	Good service from the staff. Not sure if true ...

	id	barangay
0	6301456.0	San Isidro
1	6301475.0	Bel-Air
2	6301802.0	Guadalupe Viejo
3	6301810.0	San Lorenzo
4	6301843.0	San Lorenzo