# Import all libraries
import pandas as pd
import numpy as np
from pathlib import Path
import os ,glob
import sqlite3
import folium
import gower
from scipy.cluster.hierarchy import linkage, fcluster, dendrogram
from sklearn.cluster import AgglomerativeClustering
from folium.plugins import HeatMap
from folium.plugins import FastMarkerCluster
from folium.plugins import MarkerCluster
from folium.features import CustomIcon
from tqdm import tqdm
from collections import Counter
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import random
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
from PIL import Image
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import clone
from sklearn.metrics import calinski_harabasz_score, silhouette_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, normalize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.base import clone
from sklearn.cluster import KMeans
from scipy.spatial.distance import euclidean, cityblock
from prince import MFA
from IPython.core.display import HTML
import pprint
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
sns.set_style("white")
pp = pprint.PrettyPrinter(indent=4, width=100)
HTML('''
<style>
.output_png {
display: table-cell;
text-align: center;
vertical-align: middle;
}
</style>
<script>
code_show=true;
function code_toggle() {
if (code_show){
$('div.input').hide();
} else {
$('div.input').show();
}
code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>
''')
def clean_data(df):
""" Return cleaned data."""
df1 = df.copy()
# MULTI-CUISINE
df1['count_cuisine'] = df1.cuisines.str.split(',').apply(lambda x: np.size(x))
df1['multi_cuisine'] = df1['count_cuisine'].apply(lambda x: 1 if x>1 else 0)
# df1['cuisines'] = df.cuisines.str.split(',', n=0, expand = True)
# .drop(columns=[1,2,3,4,5,6]))
# df1.drop(columns='cusines', inplace=True)
# add new column if a restaurant has a rating or not
df1['has_rating'] = df1['user_rating.aggregate_rating'].apply(lambda x: 1 if x>1 else 0)
# one-hot encode cuisines
space_strip = lambda x : 'cuisine_' + x.strip()
df1.cuisines = df1.cuisines.apply(lambda x :
list(map(space_strip,
list(set(x.split(','))))))
df1 = df1.drop('cuisines', 1).join(df1.cuisines.str.join('|')
.str.get_dummies())
# one-hot encode highlights
hlight_strip = (lambda x : 'highlight_' + x.strip().lstrip("'")
.rstrip("'"))
df1.highlights = df1.highlights.apply(lambda x :
list(map(hlight_strip,
x[1:-1].split(","))))
df1 = df1.drop('highlights', 1).join(df1.highlights.str.join('|')
.str.get_dummies())
# one-hot encode establishment
est_strip = (lambda x : 'establishment_' + x.strip().lstrip("'")
.rstrip("'"))
df1.establishment = df1.establishment.apply(lambda x :
list(map(est_strip,
x[1:-1].split(","))))
df1 = df1.drop('establishment', 1).join(df1.establishment.str.join('|')
.str.get_dummies())
return df1
def plot_clusters(X, ys):
"""Plot clusters given the design matrix and cluster labels"""
k_max = len(ys) + 1
k_mid = k_max//2 + 2
fig, ax = plt.subplots(2, k_max//2, dpi=150, sharex=True, sharey=True,
figsize=(10,5))
for k,y in zip(range(2, k_max+1), ys):
if k < k_mid:
ax[0][k%k_mid-2].scatter(*zip(*X), c=y, cmap = 'Set3',
s=5, alpha=0.9)
ax[0][k%k_mid-2].set_title('$k=%d$'%k)
else:
ax[1][k%k_mid].scatter(*zip(*X), c=y, cmap = 'Set3',
s=5, alpha=0.9)
ax[1][k%k_mid].set_title('$k=%d$'%k)
return ax
def pooled_within_ssd(X, y, centroids, dist):
"""Compute pooled within-cluster sum of squares around the cluster mean
Parameters
----------
X : array
Design matrix with each row corresponding to a point
y : array
Class label of each point
centroids : array
Number of pairs to sample
dist : callable
Distance between two points. It should accept two arrays, each
corresponding to the coordinates of each point
Returns
-------
float
Pooled within-cluster sum of squares around the cluster mean
"""
return sum(dist(x_i, centroids[y_i])**2/(2*(y == y_i).sum())
for x_i, y_i in zip(X, y))
def gap_statistic(X, y, centroids, dist, b, clusterer, random_state=None):
"""Compute the gap statistic
Parameters
----------
X : array
Design matrix with each row corresponding to a point
y : array
Class label of each point
centroids : array
Number of pairs to sample
dist : callable
Distance between two points. It should accept two arrays, each
corresponding to the coordinates of each point
b : int
Number of realizations for the reference distribution
clusterer : KMeans
Clusterer object that will be used for clustering the reference
realizations
random_state : int, default=None
Determines random number generation for realizations
Returns
-------
gs : float
Gap statistic
gs_std : float
Standard deviation of gap statistic
"""
rng = np.random.default_rng(random_state)
minimum = X.min(0)
maximum = X.max(0)
log_wk = []
for i in range(b):
clusterer1 = clone(clusterer)
clusterer1.set_params(n_clusters=len(centroids))
# drawing from a uniform distribution
X_ = rng.uniform(minimum, maximum, size=X.shape)
# fitting the drawn values
y_ = clusterer1.fit_predict(X_)
log_wk.append(np.log(pooled_within_ssd(X_, y_,
clusterer1.cluster_centers_,
dist)))
return [np.mean(log_wk - np.log(pooled_within_ssd(X, y, centroids,
dist))),
np.std(log_wk - np.log(pooled_within_ssd(X, y, centroids, dist)))]
def cluster_range(X, clusterer, k_start, k_stop, actual=None):
"""Perform k-Means clustering"""
ys = []
inertias = []
chs = []
scs = []
gss = []
gssds = []
ps = []
amis = []
ars = []
for k in range(k_start, k_stop+1):
clusterer_k = clone(clusterer)
clusterer_k.set_params(n_clusters=k)
y = clusterer_k.fit_predict(X)
ys.append(y)
inertias.append(clusterer_k.inertia_)
try:
chs.append(calinski_harabasz_score(X, y))
except:
chs.append(np.nan)
try:
scs.append(silhouette_score(X, y))
except:
scs.append(np.nan)
if actual is not None:
ps.append(purity(actual, y))
amis.append(adjusted_mutual_info_score(actual, y))
ars.append(adjusted_rand_score(actual, y))
gs = gap_statistic(X, y, clusterer_k.cluster_centers_,
euclidean, 5,
clone(clusterer).set_params(n_clusters=k),
random_state=1337)
gss.append(gs[0])
gssds.append(gs[1])
return_dict = {'ys' : ys, 'inertias' : inertias, 'chs' : chs,
'gss' : gss, 'gssds' : gssds, 'scs' : scs}
if actual is not None:
return_dict['ps'] = ps
return_dict['amis'] = amis
return_dict['ars'] = ars
return return_dict
def plot_internal(inertias, chs, scs, gss, gssds):
"""Plot internal validation values."""
fig, ax = plt.subplots(4,1,dpi=100, figsize=(8,12))
plt.subplots_adjust(hspace=.3)
ks = np.arange(2, len(inertias)+2)
ax[0].plot(ks, inertias, '-o', label='SSE', c='#E23744')
ax[1].plot(ks, chs, '-o', label='CH', c='#E23744')
ax[0].set_xlabel('$k$')
ax[1].set_xlabel('$k$')
ax[2].plot(ks, gss, '-o', label='Gap statistic', color = '#E23744')
ax[3].plot(ks, scs, '-o', label='Silhouette coefficient', c='#E23744')
ax[2].set_xlabel('$k$')
ax[3].set_xlabel('$k$')
ax[0].legend()
ax[1].legend()
ax[2].legend()
ax[3].legend()
return ax
def add_cluster(df, group, color):
"""Plot clusters in a map"""
feature_group = folium.FeatureGroup(group)
for row in tqdm(df.itertuples()):
popup_str = f"""<b>{row.name}</b> <br>"""
popup = folium.Tooltip(popup_str)
feature_group.add_child(folium.Circle(location=[row.latitude,
row.longitude],
radius=20,
color=color,
fill= True,
fill_color = color,
fill_opacity=0.8,
opacity=0.8,
tooltip = popup))
return feature_group
Clustering Restaurants in Makati
A data-driven approach to food tripping
Implementing a clustering method of restaurants provides a way to create a data-driven approach of choosing where to eat. As such, this study sought to find out whether the said method will show significant results especially when done in the city of Makati. This study as well aimed to determine what the cluster’s features are and how it can help someone know where to satisfy their palettes. Data was gathered via the Zomato API, yielding 2375 data points pertaining to restaurants. Once mined and wrangled, dataset was reduced via multiple factor analysis. After which, the cleaned data was processed via $k$-means clustering. Internal validation methods were used to determine the most appropriate number of clusters with the dataset. After implementing the k-Means clustering method and internal validation checks, 4 clusters were chosen which were labeled as the following: namely, Quick Bites, Fast Food, Casual Dining and Full-Service Restaurants. The top features and top restaurants per cluster were taken to provide a brief idea of themes of each cluster. This can help several stakeholders in providing an unsupervised learning method as a way of making informed decisions on where to go to eat.
Makati City is the hub for business and culture in the Philippines, and it houses the biggest financial district in the country. It has a population of 582,605, which booms to around 4.2M in the day. Recently though, it was notably identified as the gastronomic epicenter of the Philippines. Businesses and owners have taken advantage of this fact by building more restaurants and food places. As a result, various types of these food places have sprouted everywhere so much that it’s hard to determine which one is which. Because of this, people often have a hard time choosing where to fill their palettes. This study aims to eliminate that by having a data driven approach on choosing where to eat. Thus, we sought to answer the following question: What clusters would emerge based on the features of restaurants in Makati?
In doing this study, 4 major stakeholders were considered so we could determine how beneficial this study is for their case, and these are the customers, restaurant owners, policymakers, and Zomato.
To be able to identify clusters of restaurants in Makati city, a total of 2,375 restaurants were extracted using the Zomato API (https://developers.zomato.com/api). The general workflow for clustering the restaurants as shown in Figure 1 involves the following steps:
Each step of the workflow will be discussed in the succeeding sections.

Restaurants in Makati were extracted using the Zomato API (https://developers.zomato.com/api) via web scraping tools, specifically Python’s requests module. The process involved extracting the restaurant features and reviews. Initial cleaning was also implemented to remove irrelevant columns. There are 3 csv files extracted from this step, zomato_restaurantcs.csv, zomato_reviews.csv, and zomato_address.csv.
The source code for scraping the restaurants is in Appendix A.
Data preprocessing was implemented on the acquired restaurants data. The preprocessing involves:
location.latidue and location.longitdue, and missing establishment).df = pd.read_csv('zomato_restaurants.csv')
# Not in Makati City
df = df[(df['location.latitude'] != 0) & (df['location.longitude'] != 0)]
# empty data
df = df.set_index('id').drop([19234265, 19043191.0, 19307546.0, 6305044.0, 6308954.0,
6312056.0, 18153422.0, 19306478.0, 19312802.0,
19110184.0, 19110323.0]).reset_index()
df_barangay = pd.read_csv('zomato_address.csv')[['id', 'barangay']]
df_barangay = df_barangay[df_barangay['id'].isin(df['id'].to_list())]
df_reviews = pd.read_csv('zomato_reviews.csv')
The extracted datasets were correponding to 2,375 restaurants were stored in a local sqlite database. A total of 3 tables were created: restaurants, reviews, and address. Table 1 shows the data description of all data fields used in the study.
file = 'zomato.db'
conn = sqlite3.connect(file)
df.to_sql('restaurants', conn, if_exists='replace', index=False)
df_reviews.to_sql('reviews', conn, if_exists='replace', index=False)
df_barangay.to_sql('address', conn, if_exists='replace', index=False)
| Data Field | Description | Table |
|---|---|---|
| ID | Unique ID of the restaurant | All |
| Name | Name of the restaurant | restaurants |
| Latitude | Latitude value of the location of the restaurant | restaurants |
| Longitude | Longitude value of the location of the restaurant | restaurants |
| Average Cost for Two | Cost of food for two persons | restaurants |
| Price Range | Price group the restaurant belongs to\, 1 as the cheapest and 4 as the most expensive | restaurants |
| Photos Count | Count of food/restaurant photos shared by customers | restaurants |
| Rating | Over-all rating of the restaurant | restaurants |
| Has Rating | Whether restaurant is rated or not | restaurants |
| Votes | Count of ratings given | restaurants |
| Cuisines | List of cuisines offered | restaurants |
| Cusine Count | Total number of cuisines offered | restaurants |
| Multi-cuisine | Whether restaurant has offers more than one cuisine or not | restaurants |
| Highlights | List of features/amenities available | restaurants |
| Has Table Reservation | Whether restaurant allows table reservation or not | restaurants |
| Has Table Booking | Whether restaurant allows table booking or not | restaurants |
| Reviews Count | Count of reviews left by customers | restaurants |
| Reviews | List of 5 most recent reviews for the restaurant | reviews |
| Barangay | Barangay where the restaurant is located | address |
restaurants table¶pd.read_sql('SELECT * from restaurants', conn).head(5)
reviews table¶pd.read_sql('SELECT * from reviews', conn).head(5)
address table¶pd.read_sql('SELECT * from address', conn).head(5)
The dataset extracted from Zomato is ample and with sufficient information about restaurants in Makati city. An important step for understanding what is possible to do with the dataset was to perform exploratory data analysis by understanding the different features.
Figure 2 shows the location of the different restaurants in Makati city. It can be seen how these restaurants tend to agglomerate in busy areas such as Poblacion, and the shopping malls in Ayala. Additionally, different areas are composed of restaurants with varying price range and user rating.
viz = pd.read_sql('SELECT * from restaurants', conn)
token = open("mapbox_token.txt").read() # you need your own token
fig = px.scatter_mapbox(viz,
lat="location.latitude",
lon="location.longitude",
size="average_cost_for_two",
color="user_rating.aggregate_rating",
color_continuous_scale=px.colors.sequential.thermal,
hover_name = 'name',
hover_data = {'average_cost_for_two': True,
'user_rating.aggregate_rating': True,
'photo_count': True,
'establishment': True,
'highlights': False,
'cuisines': True,
'location.latitude':False,
'location.longitude':False},
labels = {'user_rating.aggregate_rating': 'rating'})
fig.update_layout(
mapbox = {
'accesstoken': token,
'style': "streets", 'zoom': 12},
showlegend = False)
fig.show(renderer = 'notebook')
average_price_for_two and color based on user_rating.aggregate_rating).¶Figure 3 shows the varying cuisines that restaurants in Makati offer with Filipino as the top cuisine in terms of number of restaurants/food places. Additionally, the most popular establishments are Casual Dining, Quick Bites, and Beverages Shops in terms of number of restaurants/food places.
viz = pd.read_sql('SELECT * from restaurants', conn)
viz['cuisines'] = (viz.cuisines.str.split(',', n=0, expand=True)
.drop(columns=[1, 2, 3, 4, 5, 6]))
viz['establishment'] = viz['establishment'].astype(
str).str.replace('\[|\]|\'', '')
viz['establishment'] = viz['establishment'].replace(
to_replace='establishment_', value='', regex=True)
small = 10
plt.rc('font', size=small) # controls default text sizes
plt.rc('axes', titlesize=small) # fontsize of the axes title
plt.rc('axes', labelsize=small) # fontsize of the axes title
plt.rc('xtick', labelsize=small) # fontsize of the tick labels
plt.rc('ytick', labelsize=small) # fontsize of the tick labels
plt.rc('legend', fontsize=small) # legend fontsize
plt.rc('figure', titlesize=small)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(11, 5))
pop_cuisines = viz.groupby(['cuisines']).count(
)[['id']].sort_values(by='id', ascending=False)[:10]
pop_establishments = viz.groupby(['establishment']).count()[
['id']].sort_values(by='id', ascending=False)[:10]
sns.barplot(x='id', y=pop_cuisines.index,
data=pop_cuisines,
color='#E23744', ax=ax1)
ax1.set(xlabel='Number of Restaurants', ylabel='Primary Cuisine')
sns.barplot(x='id', y= pop_establishments.index, data=pop_establishments,
color='#E23744', ax=ax2)
ax2.set(xlabel='Number of Restaurants', ylabel='Establishments')
fig.tight_layout(pad=3.0)
Several features that could improve the performance of the clustering method were processed and derived. Specifically, feature extraction was based on the original features - cuisine, highlight, establishment and user_rating.aggregate_rating.
One-hot encoded features. For the cuisine, highlight and establishment, the values under them are in list form. Hence, these features were one-hot encoded.
Derived features. Three columns were derived from existing features:
has_rating (boolean) - if a restaurant has ratings
count_cuisine (integer) - number of cuisines of a restaurant
multi_cuisine (boolean) - if a restaurant has multiple cuisines
df_clean = clean_data(pd.read_sql("""SELECT * FROM restaurants""", conn))
df_info = df_clean[['id', 'name', 'location.latitude', 'location.longitude']]
df_num = df_clean.drop(['id', 'name',
'location.latitude', 'location.longitude'],
axis=1)
df_num = df_num[['average_cost_for_two', 'all_reviews_count',
'photo_count', 'user_rating.aggregate_rating',
'user_rating.votes', 'count_cuisine',
'is_table_reservation_supported',
'has_table_booking'] + df_num.columns[10:].tolist()]
# convert one-hot encoded features to string
df_num.iloc[:, 7:] = df_num.iloc[:, 7:].astype(int).astype(str)
After performing the feature extraction, the data set had 2,375 rows and 165 columns. So, the dimensions were reduced to improve the accuracy and computational power of the clustering algorithm used.
Multiple factor analysis (MFA) was performed to reduce the number of columns that would be considered in the clustering process. Since MFA works well with grouped features of the same data type, this dimensionality reduction was performed on the one-hot encoded features of cuisine, highlight and establishment. Furthermore, grouping features with the same nature of values is a crucial step in performing MFA. Thus, we grouped these features based on cuisine, highlight and establishment. We chose the top three components which retained the highest variance explained. The selected components were able to retaine $94.54\%$ of the variance explained.
After reducing the dimension, the final design matrix that was used in the clustering process has 2,375 rows and 12 columns.
groups = {'cuisines' : df_num.columns[9:71].tolist(),
'highlights' : df_num.columns[71:144].tolist(),
'establishments' : df_num.columns[144:].tolist()}
mfa = MFA(groups=groups, n_components=3, n_iter=1000, random_state=42)
mfa.fit(df_num.iloc[:, 9:])
df_cat_mfa = mfa.transform(df_num.iloc[:, 9:])
df_cat_mfa.columns = ['comp_cuisine', 'comp_estab', 'comp_highlight']
explained_intertia = sum(mfa.explained_inertia_)
To determine the clusters of Zomato-listed Makati restaurants formed based on the features included in this study, $k$-means clustering method was used. Before performing the clustering process, the values in the design matrix were normalized to even out the magnitude of the values in each column. In the clustering process, values from $k=2$ to $k=9$ clusters were considered. To provide a visual representation of the clustering for each value of $k$, a Principal Component Analysis (PCA) was performed. Then, for each value of $k$, the two principal components with the highest variance explained were plotted in a two-dimensional plane to have an overview of the clustering. For a quantitative measure of the appropriate number of clusters, an internal validation was performed. Specifically, the internal validation criteria used were (1) sum of squared distances to the centroids, (2) Calinski-Harabasz index, (3) Silhouette coefficient and (4) Gap statistic.
After performing $k$-means clustering with number of clusters $k=2$ to $k=9$, and plotting the two principal components with the highest variance explained under PCA in a two-dimensional plane, the graphs in Figure 4 were derived.
X = df_num.iloc[:, :9].merge(df_cat_mfa, left_index=True, right_index=True)
X.iloc[:,7:9] = X.iloc[:,7:9].astype(int)
X_norm = normalize(X)
X_new = PCA(n_components=2, random_state=1337).fit_transform(X_norm)
res = cluster_range(X_norm, KMeans(random_state=1337), 2, 9, actual=None)
plot_clusters(X_new, res['ys']);
Then, the results of the internal validation using the four criteria mentioned are shown in Figure 5.
plot_internal(res['inertias'], res['chs'], res['scs'],
res['gss'], res['gssds']);
Looking at the internal validation results, the selected values of $k$ actually show relatively good results. Hence, for interpretability purpose, we chose $k=4$, that is, four clusters of Zomato-listed Makati restaurants. Basing on the SSE criterion, the ‘elbow’ can be seen at either $k=3$ or $k=4$. Although the highest values of the CH index and the Silhouette coefficient are not observed at $k=4$, the corresponding values at $k=4$ are somehow comparable being at the top three highest values under these indices. For the gap statistic criterion, although a good choice can be $k=5$, the gap statistic at $k=4$ is comparable.
The two-component (PCA) representation of this clustering is presented in Figure 6.
kmeans = KMeans(n_clusters=4, random_state=1337)
kmeans.fit_predict(X_norm);
kmeans_3 = KMeans(n_clusters=3, random_state=1337)
kmeans_3.fit_predict(X_norm);
kmeans_5 = KMeans(n_clusters=5, random_state=1337)
kmeans_5.fit_predict(X_norm);
cluster_color = {1:'#236E96',
2:'#F3872F',
3:'#EC5578',
4:'#FFD700'}
sns.scatterplot(X_new[:,0], X_new[:,1],
hue=(kmeans.labels_)+1, palette=cluster_color);
plt.xlabel('PCA 1');
plt.ylabel('PCA 2');
# Save results
results = pd.Series(kmeans.labels_)
results_3 = pd.Series(kmeans_3.labels_)
results_5 = pd.Series(kmeans_5.labels_)
df_clean['label'] = results
df_clean['label'] = (df_clean['label'] + 1).astype(str)
df_clean['label'] = df_clean['label'].replace({'1':'2', '2':'1',
'4':'3', '3':'4'})
df_clean['label_3'] = results_3
df_clean['label_3'] = (df_clean['label_3'] + 1).astype(str)
df_clean['label_5'] = results_5
df_clean['label_5'] = (df_clean['label_5'] + 1).astype(str)
The application of unsupervised clustering technique to the 2,375 Makati restaurants listed in Zomato resulted to 4 clusters with Cluster 2 dominating based on count. The distribution of the clusters based on count is shown in Figure 7.
tempdf = df_clean.groupby(['label']).size().reset_index().rename(columns = {0:'count'})
tempdf['total']= tempdf['count'].sum()
tempdf
tempdf['percent']= tempdf['count']/tempdf['total']*100
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x='label',y='percent',data=tempdf, color = '#E23744');
plt.ylabel('% of restaurants')
plt.xlabel('Cluster')
small = 13
plt.rc('font', size= small) # controls default text sizes
plt.rc('axes', titlesize= small) # fontsize of the axes title
plt.rc('axes', labelsize= small) # fontsize of the axes title
plt.rc('xtick', labelsize= small) # fontsize of the tick labels
plt.rc('ytick', labelsize= small) # fontsize of the tick labels
plt.rc('legend', fontsize= small) # legend fontsize
plt.rc('figure', titlesize= small)
Looking closely at the features of each cluster such as price, ratings, cuisines offered and amenities, we can label each cluster as:
Characteristics of each cluster is discussed in the succeeding sections. To get an idea of how restaurants are dispersed around Makati, refer to Figure 8.
viz = df_clean.copy()
viz = viz.rename(columns={'location.latitude':'latitude',
'location.longitude':'longitude'})
mapbox = open("mapbox_key.txt").read() # you need your own token
main = folium.Map([14.5547, 121.0244], zoom_start=14, tiles=None)
main.add_child(add_cluster(viz[viz['label'] == '1'],'Cluster 1', '#236E96'))
main.add_child(add_cluster(viz[viz['label'] == '2'],'Cluster 2', '#F3872F'))
main.add_child(add_cluster(viz[viz['label'] == '3'],'Cluster 3', '#EC5578'))
main.add_child(add_cluster(viz[viz['label'] == '4'],'Cluster 4', '#FFD700'))
folium.TileLayer(tiles=mapbox, attr='Mapbox attribution',
name='Cluster').add_to(main)
folium.LayerControl(collapsed=False).add_to(main)
main
with open('misc/barangays-municity-ph137602000.0.1.json', 'r') as f:
f = f.read()
data = json.loads(f)
data['features'] = [barangay for barangay in data['features']
if barangay['properties']['ADM4_EN'] not in
['Guadalupe Viejo', 'Kasilawan', 'Rizal', 'Singkamas',
'Post Proper Northside', 'Post Proper Southside']]
barangay = pd.read_sql('SELECT * from address', conn)
viz = df_clean.copy()
viz = viz.merge(barangay, left_on = 'id', right_on = 'id')
viz = viz.groupby(['barangay', 'label']).size().unstack().replace(np.nan, 0)
viz['assigned_cluster'] = viz.idxmax(axis=1)
viz = viz.reset_index()[['barangay', 'assigned_cluster']]
viz_2 = (df.merge(barangay,
left_on='id',
right_on='id')[['id', 'location.latitude', 'location.longitude', 'barangay']]
.merge(viz, left_on ='barangay', right_on = 'barangay')
.rename(columns={'location.latitude':'latitude',
'location.longitude':'longitude'}))
style = {'1':{'fillColor': '#236E96', 'color': '#236E96'},
'2':{'fillColor': '#F3872F', 'color': '#F3872F'},
'3':{'fillColor': '#EC5578', 'color': '#EC5578'},
'4':{'fillColor': '#FFD700', 'color': '#FFD700'}}
Drilling further on the clusters and location, we’ve identified the dominant cluster per barangay as plotted in Figure 9. Cluster 2 consistently covers majority of the barangays while there are no barangays that are dominated by Cluster 4.
lat = 14.5547
lon = 121.0444
main = folium.Map(
location=[lat, lon],
zoom_start=13,
tiles=None)
for barangay in data['features']:
extra = data.copy()
group = barangay['properties']['ADM4_EN']
cluster = viz_2[viz_2['barangay']==group]['assigned_cluster'].iloc[0]
barangay['properties']['cluster'] = f'<b> Cluster {cluster} </b>'
extra['features'] = [barangay]
feature_group = folium.FeatureGroup(group)
style_color = style[viz_2[viz_2['barangay']==group]['assigned_cluster'].iloc[0]],
choropleth = folium.Choropleth(extra, key_on= group, fill_color= style_color[0]['fillColor']).add_to(main)
choropleth.geojson.add_child(folium.features.GeoJsonTooltip(['ADM4_EN', 'cluster'], labels=False))
folium.TileLayer(tiles=mapbox, attr='Mapbox attribution',
name='Barangays').add_to(main)
folium.LayerControl(False).add_to(main)
main
In this subsection, we used the reviews of restaurants to be able to further understand the different clusters.
def grey_color_func(word, font_size, position, orientation, random_state=None,
**kwargs):
return "hsl(0, 0%%, %d%%)" % random.randint(0, 60)
def plot_wc(df, cluster):
masks = {
'2': np.array(Image.open("./icons/chicken.png")),
'1': np.array(Image.open("./icons/milk-tea.png")),
'4': np.array(Image.open("./icons/serving-platter.png")),
'3': np.array(Image.open("./icons/plate.png")),
}
wc = WordCloud(
background_color="rgba(255, 255, 255, 0)",
mode="RGB", # RGBA for contours?
stopwords=stopwords,
width=500,
height=500,
mask=masks[cluster],
font_path='Tryst-Regular.otf',
color_func=grey_color_func,
)
corpus = df[df['label'] == cluster]['review'].to_list()
wc.generate(' '.join(corpus))
fig, ax = plt.subplots(1, 2, dpi=90, figsize=(10, 4))
ax[0].imshow(wc, interpolation='bilinear')
ax[0].axis('off')
ax[1].set_title('Most common words in cluster')
top_words = pd.DataFrame(reversed(list(
wc.words_.items())[:20]),
columns=['Top Words', 'Weight']
).set_index('Top Words').plot(kind='barh',
color='#cb202d', width=.8, ax=ax[1])
fig.tight_layout()
The cheapest cluster. It is dominated by on-the-go establishments – food cart, kiosks, bakeries, beverage shops and dessert parlors. Popular finger food establishments such as Potato Corner, Turks, Dunkin Donuts and Gong-cha can be found in this cluster. Since restaurants in this cluster mostly offers food that are quick to acquire, easy to eat, and frequently consumed, customers do not bother to leave comments resulting to low number of ratings and few reviews.
df_wc = pd.read_sql('SELECT * from reviews',
conn).merge(df_clean[['id', 'label']])
small = 8
plt.rc('font', size= small) # controls default text sizes
plt.rc('axes', titlesize= small) # fontsize of the axes title
plt.rc('axes', labelsize= small) # fontsize of the axes title
plt.rc('xtick', labelsize= small) # fontsize of the tick labels
plt.rc('ytick', labelsize= small) # fontsize of the tick labels
plt.rc('legend', fontsize= small) # legend fontsize
plt.rc('figure', titlesize= small) # fontsize of the figure title
stopwords = set(STOPWORDS)
stopwords.update(["place", "food", "restaurant", "one", "will",
"Makati", "ha", "na", "branch", "good",
'really', 'itâ','ordered', 'order'])
plot_wc(df_wc, '1')
Cluster 2 has the most number of restaurants among the 4 clusters. Fast food and coffee chains such as Jollibee, McDonalds, KFC, Starbucks, Coffee Bean and Tim Hortons makes up majority of the cluster. Quick service restaurants offer meals, but menus are quite limited. Usually, food is available as take-out though limited seats are provided as well.
plot_wc(df_wc, '2')
Cluster 3 are casual food places good for the gathering of friends and family. Restaurants like Max, Shakey’s, Army Navy and Yellow Cab are among those who are members of this cluster. Casual dining restaurants are priced twice as fast-food, but they offer menus with a wider range of choices and cuisines. Additionally, most casual dining restaurants offer alcohol and accepts credit card. Ambiance is good in these restaurants which encouraged customers to share photos and leave reviews and ratings.
plot_wc(df_wc, '3')
High-end restaurants can be found in Cluster 4 with meals costing as much as Php11,000 for two. These are always sit-down restaurants with fancier menus and waiter service. Service and customer experience are equally important with food offerings to these types of establishments. Customers don’t just talk about the food, but the great service and the staff as well resulting in more attached customers. This is evidenced by the number of comments and pictures shared by the customers.
plot_wc(df_wc, '4')
While it was concluded that grouping restaurants in Makati into $ k=4$ is the best choice, sensitivity analysis was performed to test the robustness of the results gathered.
Summarized below are the features of restaurant clusters for $k=3$ (Table 5) and $k=5$ (Table 6). It is noted that as the cluster count is decreased, the Quick Bites and Fast Food clusters combines into only one cluster with Casual and Full-Service clusters staying almost the same. On the other hand, clustering the restaurants into 5 resulted to splitting of the Casual and Full-Service clusters into 3. As such, it can be deduced that there are restaurants sharing common characteristics of Casual and Full-Service group, and there are subgroups of Full-Service restaurants.
| Feature | Cluster 1 | Cluster 2 | Cluster 3 |
|---|---|---|---|
| Restaurant Count | 1,373 | 738 | 264 |
| Average Cost for Two | ₱ 341 | ₱ 880 | ₱ 1,831 |
| Aggregate Rating | 1.73/5 | 3.11/5 | 3.68/5 |
| Average Count of Photos by Customers | 21 | 113 | 285 |
| Average Number Reviews | 9 | 39 | 76 |
| Table Reservation Availability | Yes | Yes | Yes |
| Feature | Cluster 1 | Cluster 2 | Cluster 3 | Cluster 4 | Cluster 5 |
|---|---|---|---|---|---|
| Restaurant Count | 582 | 825 | 611 | 280 | 77 |
| Average Cost for Two | ₱ 254 | ₱ 413 | ₱ 864 | ₱ 1,341 | ₱ 2720 |
| Aggregate Rating | 1.05/5 | 2.28/5 | 2.95/5 | 3.82/5 | 3.5/5 |
| Average Count of Photos by Customers | 11 | 33 | 80 | 329 | 190 |
| Average Number Reviews | 5 | 13 | 27 | 97 | 37 |
| Table Reservation Availability | No | Limited | Limited | Yes | Yes |
Using unsupervised clustering, we were able to find clusters of restaurants in Makati city using their feature. With $k$-means clustering, we obtained optimal value at $k = 4$ and determined the following clusters: Quick Bites, Fast Food, Casual Dining, and Full-Service. Overall, price and ratings are the features that clearly sets apart each of the four clusters. However, there are also a few differences based on other features and offerings such as cuisines offered and establishment type. A summary of the attributes of each cluster is presented in Table 7. The result of this study can be of help to customers when deciding where to eat, business owners when deciding location for business development, local government units, and Zomato.
| Feature | Cluster 1 | Cluster 2 | Cluster 3 | Cluster 4 |
|---|---|---|---|---|
| Restaurant Count | 589 | 843 | 708 | 235 |
| Average Cost for Two | ₱ 253.16 | ₱ 421.08 | ₱ 917.66 | ₱ 1,902.76 |
| Aggregate Rating | 1.05/5 | 2.3/5 | 3.14/5 | 3.64/5 |
| Average Count of Photos by Customers | 11 | 35 | 125 | 278 |
| Average Number Reviews | 5 | 14 | 41 | 74 |
| Table Reservation Availability | No | Limited | Yes | Yes |
| Cuisines Offered | Filipino, American, Chinese, Japanese, Korean | Japanese, Filipino, Asian, Korean, Chinese | Filipino, Japanese, American, Chinese, Korean | European, Japanese, American, Asian, Italian |
| Establishment Types | Bakery, Beverage Shop, Dessert Parlour, Kiosk | Cafe, Food Court, Kiosk, Quick Bites | Bar, Casual Dining, Lounge | Fine Dining, Club, Wine Bar, Lounge |
| Top Highlights | Cash, No Alcohol Available, Takeaway Available, Gcash Accepted, Table booking not available | Cash, No Alcohol Available, Restroom Available, Wifi, Has power Outlets, Table booking not available | Cash, Restroom Available, Table Booking Recommended, Credit Card, Beer, Wifi, Fullbar | Table Booking Recommended, Fullbar, Credit Card, Wifi, Restroom Available, Beer |
The following points can be considered in future research related to this work:
Abdi, H. and Valentin, D. (2007). Multiple Factor Analysis (MFA). Retrieved from: https://personal.utdallas.edu/~herve/Abdi-MFA2007-pretty.pdf
Aggarwal, C. (2015). Data Mining: The Textbook. New York: Springer Publishers
Faeldon, J. (2020). Philippines Administrative Boundaries JSON Maps. Github repository: https://github.com/faeldon/philippines-json-maps/
PhilGIS. (2020). Country Barangays File. Retrieved from: http://philgis.org/country-barangay/country-barangays-file