# importing required libraries 
import nltk
# nltk.download('stopwords')
# nltk.download('wordnet')

#Data Manipulation and Storage
import pandas as pd
from tqdm import tqdm 

# Text Cleaning
import string
import re
import nltk 
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords


#Generating n-grams
from gensim.models import Phrases


import dask.dataframe as dd
import dask.bag as db
import json
import nltk
import dask.array as da
# nltk.download('stopwords')

import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.models import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
import seaborn as sns
pyLDAvis.enable_notebook()
%matplotlib inline


import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

from IPython.core.display import HTML
import pprint

pp = pprint.PrettyPrinter(indent=4, width=100)



HTML('''
<style>
.output_png {
    display: table-cell;
    text-align: center;
    vertical-align: middle;
}


</style>


<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>
''')


from dask.distributed import Client
client = Client('172.31.21.2:8786')
client


def flatten(record):
    """Convert json to dataframe"""
    return {
        'link_id': record['link_id'],
        'body': record['body']
    }

# Read data from s3 bucket
bag = (db.read_text('s3://bdccreddit2020/*.gz', 
       storage_options={'anon': True, 'use_ssl': False}).map(json.loads))

# Count total number of raw comments in the file
# total_count = bag.count().compute()
##Count 75gb: 419,953,160

# Filter skincare subreddit and persist
skincare = (bag.filter(lambda category: 
                       category["subreddit"] == 'SkincareAddiction'))
skincare = skincare.persist()

# # Count total number of filtered skincare comments 
# total_skincare = skincare.compute().count()
# print("Count of skincare comments:", total_skincare)

ddf = skincare.map(flatten).to_dataframe()


print("Sample raw comment:\n\n", skincare.take(1)[0]['body'])

Sample raw comment:

 Yeah if you're that new to Differin you probably shouldn't have a facial and definitely not extractions. I'd highly recommend laying off of it and other actives for a good amount of time, maybe even give it a full month before you resume depending on if they did any sort of mask or peel during your treatment today.


def clean_text(text):     
    """Perform cleaning and preprocessing of data"""

    #make string lowercase 
    text = str(text)
    text = text.lower()
    
    #remove links
    text = (re.sub(r'(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})'
                   '([\/\w \.-]*)', '', text, flags=re.MULTILINE))
    text = text.replace('/', ' ')
        
    #tokenize 
    tokens = nltk.word_tokenize(text)
    clean_text = []
    
    #remove stopwords, puncuation, then lemmatize
    for word in tokens:
        # Remove punctuations and special characters
        word = re.sub(r'\\n','', word)
        word = re.sub(r'\[.*?\]', '', word)
        word = re.sub(r'[^\w\s]', '', word)
        if (word not in stopwords_english and word not in string.punctuation):             
            if not word.isdigit():#re.search('\w(?:\w|\-)+', word):
                token = wordnet_lemmatizer.lemmatize(word)
                #remove 1 letter words
                token =  re.sub(r'\b[a-zA-Z]\b', '', token)
                clean_text.append(token)
                        
    return clean_text

def add_ngram(doc): 
    """Return trigrams of doc"""
    return trigram[bigram[doc]]


# "stopeng" is the extended list of stopwords for use 
stop = ["aint", "amnt", "arent", "cant", "couldve", "couldnt",
                    "daresnt", "didnt", "doesnt", "dont", "gonna", "gotta", 
                    "hadnt", "hasnt", "havent", "hed", "hell", "hes", "howd",
                    "howll", "hows", "Id", "Ill", "Im", "Ive", "isnt", "itd",
                    "itll", "its", "lets", "maynt", "mayve", "mightnt", 
                    "mightve", "mustnt", "mustve", "neednt", "oclock", "ol",
                    "oughtnt", "shant", "shed", "shell", "shes", "shouldve",
                    "shouldnt", "somebodys", "someones", "somethings", "thatll",
                    "thatre", "thats", "thatd", "thered", "therere", "theres", 
                    "thesere", "theyd", "theyll", "theyre", "theyve", "thiss",
                    "thosere", "tis", "twas", "twasnt", "wasnt", "wed", "wedve",
                    "well", "were", "weve", "werent", "whatd", "whatll", 
                    "whatre", "whats", "whatve", "whens", "whered", "wherere",
                    "wheres", "whereve", "whichs", "whod", "whodve", "wholl",
                    "whore", "whos", "whove", "whyd", "whyre", "whys", "wont",
                    "wouldve", "wouldnt", "yall", "youd", "youll", "youre", 
                    "youve", "s", "s", "said", "also", "would", "m", "n",
                    "re","ve" , "ll", "d", "t", "much", "u", "amp","ll", "i",
                    'also', 'really', 'ive', 'even', 'im', 'ill', 'lol',
                    'ugh', 'oof','yeah', 'haha' , 'oh', 'u', 'etc' ,'yes', 
                    'thing', 'go',"tho", "pih", "gt" ,"two" "one", "nt",
                    'jon', 'lot', 'could', 'many', 'thanks', 'thank', 'hi']


wordnet_lemmatizer = WordNetLemmatizer()
stopeng = stopwords.words('english') + stop
stopeng.extend([x.replace("\'", "") for x in stopeng])
stopwords_english = list(set(stopeng))


# Filter automated comments by bots
ddf1 = ddf.copy()
ddf1 = ddf1[~ddf1['body'].str.contains("Beep boop. ")]
ddf1 = ddf1[~ddf1['body'].str.contains("I am a bot, and this action was performed automatically.")]
ddf1 = ddf1[~ddf1['body'].str.contains("I'm afraid your post was removed")] 
ddf1 = ddf1[~ddf1['body'].str.contains("I'm afraid your post has been removed.")] 
ddf1 = ddf1[~ddf1['body'].str.contains("https://www.reddit.com/r/SkincareAddiction/wiki/")] 
ddf1 = ddf1[ddf1['body'] !='[deleted]']
ddf1 = ddf1[ddf1['body'] !='[removed]']
ddf1 = ddf1.compute()

#for checking of there are automated comments left
# ddf1a = ddf1.groupby(['body']).count().sort_values(by='link_id',ascending=False)

# Group comments by submission as one document
ddf1 = ddf1.groupby('link_id').agg(list)

# Perform cleaning on comments
ddf1['body_clean'] = ddf1['body'].apply(lambda x : clean_text(x))

# Apply n-grams
bigram = Phrases(ddf1['body_clean'], min_count=10)
trigram = Phrases(bigram[ddf1['body_clean']])
ddf1['body_ngrams'] = ddf1['body_clean'].apply(lambda x : add_ngram(x))


ddf1.sample(5)


from gensim.summarization import keywords
from PIL import Image
from itertools import chain
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import itertools


def plot_wordcloud():
    """Return the wordcloud figure of the text"""
    text = list(itertools.chain(*ddf1['body_ngrams']))
#     color=ImageColorGenerator(mask)
    plt.subplots(figsize=(13,8))
    wordcloud = WordCloud(max_words=1000, 
#                           min_font_size=8, 
                          collocations=False,
                          colormap='Set2',
                          mode = "RGBA", background_color='white',
                          stopwords = stopwords_english
                         ).generate(' '.join(text))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
#     plt.savefig(f'./wordcloud/wordcloud_{label+1}.png')
#     plt.tight_layout(pad=0);
plot_wordcloud()


# Build Dictionary & Corpus for Topic Model

#set dictionary and corpus
docs = ddf1['body_ngrams'].values
dictionary = Dictionary(docs)
dictionary.filter_extremes(no_below=20, no_above=0.5)
corpus = [dictionary.doc2bow(doc) for doc in docs]

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token


def compute_coherence_values(dictionary, corpus, texts, id2word, limit=40, start=2, step=3):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    
    chunksize = 1000
    passes = 15
    iterations = 10
    eval_every = 1  
    coherence_values = []
    model_list = []
    
    for num_topics in tqdm(range(start, limit, step)):
        model=(LdaModel(corpus=corpus, id2word=id2word, 
                       chunksize=chunksize, 
                       alpha='auto', eta='auto', 
                       random_state=100,
                       iterations=iterations, 
                       num_topics=num_topics, 
                       passes=passes))
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values


def plot_cv_scores(coherence_values, limit=40, start=2, step=1):
    x = range(start, limit, step)
    plt.plot(x, coherence_values)
    plt.xlabel("Num Topics")
    plt.ylabel("Coherence score")
    plt.legend(("coherence_values"), loc='best')
    plt.show()


#RUN TO GET APPROPRIATE NUMBER OF TOPICS
model_list, coherence_values = (compute_coherence_values(dictionary, 
                                                         corpus, docs, id2word, 
                                                         limit=17, start=9, step=1))

100%|██████████| 8/8 [26:01<00:00, 195.23s/it]


plt.subplots(figsize=(10,7))
plot_cv_scores(coherence_values, limit=17, start=9, step=1)


opt_model =  models.LdaModel.load('./BDCC Final Project/models/lda.model')


# opt_model = model_list[6]
pyLDAvis.gensim.prepare(opt_model, corpus, dictionary)


pd.set_option('display.max_colwidth', 310)
pd.DataFrame(opt_model.print_topics(num_words=10), columns=['Topic','Keywords']).set_index('Topic')


# Assign label for each topic and get distribution 

topics = {0: 'Other Skin Issues', 1: 'Brightening', 
          2: 'Feedbacks, Tips, Recommendations',
          3: 'Facial Cleansers', 4: 'Acne', 
          5: 'Feedbacks, Tips, Recommendations',
          6: 'Hydration', 7: 'Body Skincare', 8: 'Acne', 
          9: 'Product Ingredients', 10: 'Lip Care', 11: 'Sun Protection',
          12:'Scars', 13:'Acne', 14: 'Feedbacks, Tips, Recommendations'}
label_desc = {0: 'Other Skin Issues', 1: 'Brightening', 2: 'Feedbacks',
          3: 'Facial Cleansers', 4: 'Acne (Severe)', 5: 'Tips',
          6: 'Hydration', 7: 'Body Skincare', 8: 'Acne (Product Ingredients)', 
          9: 'Product Ingredients', 10: 'Lip Care', 11: 'Sun Protection',
          12:'Scars', 13:'Acne (Issues)', 14: 'Recommendations'}
            
topic_dist = opt_model.get_document_topics(corpus)
label = [sorted(topic_dist[i],key=lambda x: -x[1])[0][0] 
          for i in range(len(ddf1))]
ddf1['label'] = label
ddf1['label_desc'] = ddf1['label'].replace(label_desc)
ddf1['topic'] = ddf1['label'].replace(topics)


# plot distribution
plt.subplots(figsize=(15,7))
plt.title("Topic Distribution")
sns.countplot(ddf1['label_desc'], data=ddf1)
plt.xlabel("Cluster Label")
plt.xticks(rotation=30,horizontalalignment='right');

/opt/conda/lib/python3.8/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(


# data = list(itertools.chain(*ddf1['body_ngrams'].values.tolist()))
data = ddf1['body_ngrams'].values.tolist()


def format_topics_sentences(ldamodel=None, corpus=corpus, texts=data):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row_list in enumerate(ldamodel[corpus]):
        row = row_list[0] if ldamodel.per_word_topics else row_list            
        # print(row)
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = (sent_topics_df.append(pd.Series([int(topic_num), 
                        round(prop_topic,4), topic_keywords]), ignore_index=True))
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


import itertools

df_topic_sents_keywords = format_topics_sentences(ldamodel=opt_model, corpus=corpus, texts=data)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
df_dominant_topic.head(10)


from gensim.summarization import keywords
from PIL import Image
from itertools import chain
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import itertools


def plot_wordcloud(label, ax=None):
    """Returns the wordcloud figure of the topic"""
    
    if ax is None:
        ax = plt.gca()
    
    text = list(itertools.chain(*ddf1[ddf1.label==label]['body_ngrams']))
    mask_path = f'./BDCC Final Project/icons/{label+1}.png'
    mask = np.array(Image.open(mask_path))
#     color=ImageColorGenerator(mask)
#     plt.subplots(figsize=(5,5))
    wordcloud = WordCloud(max_words=150, 
                          min_font_size=8, mask=mask, #prefer_horizontal=0.75, 
                          collocations=False,
                          colormap='Set2',
                          mode = "RGBA", background_color='white',
                          stopwords = stopwords_english
                         ).generate(' '.join(text))
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis('off');
#     plt.savefig(f'./wordcloud/wordcloud_{label+1}.png')
#     ax.tight_layout(pad=0);


plt.close()
f, (ax) = plt.subplots(2, 4, figsize=(13,7), dpi=160)
plot_wordcloud(4, ax[0][0])
ax[0][0].set_title("Severe Acne")

plot_wordcloud(13, ax[0][1])
ax[0][1].set_title("Acne Issues")

plot_wordcloud(8, ax[0][2])
ax[0][2].set_title("Acne Product Ingredients")

plot_wordcloud(1, ax[0][3])
ax[0][3].set_title("Bright Skin")

plot_wordcloud(6, ax[1][0])
ax[1][0].set_title("Hydration")

plot_wordcloud(11, ax[1][1])
ax[1][1].set_title("Sun Protection")

plot_wordcloud(6, ax[1][2])
ax[1][2].set_title("Scars")

plot_wordcloud(6, ax[1][3])
ax[1][3].set_title("Other Skin Concerns")

plt.tight_layout(pad=0);


f, (ax1,ax2) = plt.subplots(1,2, figsize=(13,5),dpi=160)
plot_wordcloud(10, ax1)
ax1.set_title("Lip Care")

plot_wordcloud(7, ax2)
ax2.set_title("Body Care");


f, (ax1,ax2) = plt.subplots(1,2, figsize=(13,5),dpi=160)
plot_wordcloud(3, ax1)
ax1.set_title("Facial Cleansers")

plot_wordcloud(9, ax2)
ax2.set_title("Product Ingredients");


plt.close()
f, (ax1, ax2,ax3) = plt.subplots(1, 3, figsize=(13,5), dpi=160)
plot_wordcloud(2, ax1)
ax1.set_title('Feedbacks')
plot_wordcloud(5, ax2)
ax2.set_title('Tips')
plot_wordcloud(14, ax3)
ax3.set_title('Recommendations')
plt.tight_layout(pad=0);

Feature	Description
all_awardings	List of every single award given to a post and how many times each award was given
author	The author of the subreddit post.
author_created_utc	The date the account is created in UTC time
author_flair_background_color	The author's flair background color
author_flair_css_class	The flair CSS class for the author
author_flair_template_id	The ID number of the flair template
author_flair_text	The text for the author's flair.
author_flair_text_color	The text color of the flair
author_flair_type	The tagging type of the flair
author_fullname	The user ID of the author
author_patreon_flair	Special flair for Patreon creators
author_premium	Boolean if the author is a premium user
body	The text content of the comment.
can_gild	Whether or not this link can be "gilded" by giving the link author Reddit Gold
controversiality	The controversiality score of the comment
created_utc	The UTC time the link was created
distinguished	Whether or not the link has been distinguished by a moderator.
edited	The UNIX Time stamp the link has been edited or false
gilded	The number of times the link has been gilded.
gildings	List of users gilded the post
id	The ID of the link submission.
link_id	The ID of the comment
parent_id	ID of the thing this comment is a reply to, either the link or a comment in it
permalink	The link for the comment
removal_reason	Reason provided by moderator for removal of the comment.
score	The net-score of the comment
stickied	True if the Comment is set as the sticky in its thread.
subreddit	Subreddit of comment excluding the r/ prefix. "pics"
subrredit_id	The id of the subreddit in which the comment is located
subreddit_name_prefixed	Subreddit of comment including the r/ prefix. "pics"
subreddit_type	The subreddit's type - one of "public", "private", "restricted", or in very special cases "gold_restricted" or "archived"

	body	body_clean	body_ngrams
link_id
t3_ez0fyt	[Your skins looks great to me you seem to red ...	[skin, look, great, seem, red, scar, try, put,...	[skin, look, great, seem, red, scar, try, put,...
t3_fqy7r1	[Currently I have a bunch of different things ...	[currently, bunch, different, thing, sample, c...	[currently, bunch_different, thing, sample, co...
t3_etatut	[Can anyone tell me what niacinamide is suppos...	[anyone, tell, niacinamide, suppose, ordinary,...	[anyone_tell, niacinamide, suppose, ordinary, ...
t3_fmto2b	[AHAs and BHAs are exfoliators are they not?]	[ahas, bhas, exfoliators]	[ahas_bhas, exfoliators]
t3_f3qvoe	[Do you have a picture?]	[picture]	[picture]

	Keywords
Topic
0	0.020"product" + 0.017"using" + 0.016"face" + 0.011"try" + 0.010"issue" + 0.010"sensitive" + 0.009"cream" + 0.009"think" + 0.009"maybe" + 0.009"tried"
1	0.033"vitamin_" + 0.031"serum" + 0.028"retinol" + 0.026"product" + 0.015"ordinary" + 0.011"eye" + 0.011"like" + 0.010"good" + 0.010"vitamin__serum" + 0.009"one"
2	0.015"like" + 0.011"time" + 0.010"work" + 0.010"one" + 0.010"love" + 0.010"get" + 0.010"face" + 0.007"people" + 0.007"good" + 0.007"look"
3	0.046"cleanser" + 0.021"oil" + 0.019"face" + 0.015"oily" + 0.014"product" + 0.014"moisturizer" + 0.013"routine" + 0.011"dry" + 0.011"using" + 0.011"good"
4	0.030"acne" + 0.012"get" + 0.009"accutane" + 0.009"doctor" + 0.008"help" + 0.008"take" + 0.007"know" + 0.006"diet" + 0.006"think" + 0.006"year"
5	0.024"people" + 0.021"product" + 0.015"like" + 0.013"think" + 0.013"ingredient" + 0.011"skincare" + 0.010"know" + 0.009"say" + 0.007"mean" + 0.007"brand"
6	0.022"like" + 0.022"moisturizer" + 0.020"love" + 0.017"one" + 0.016"cream" + 0.016"dry" + 0.011"face" + 0.010"great" + 0.010"tried" + 0.009"feel"
7	0.024"hair" + 0.015"get" + 0.014"body" + 0.013"shower" + 0.012"look" + 0.011"help" + 0.010"like" + 0.009"try" + 0.007"shaving" + 0.007"treatment"
8	0.031"acne" + 0.023"using" + 0.020"routine" + 0.018"tretinoin" + 0.016"differin" + 0.015"tret" + 0.013"help" + 0.012"week" + 0.012"moisturizer" + 0.011"azelaic_acid"
9	0.036"product" + 0.021"using" + 0.017"niacinamide" + 0.015"routine" + 0.015"moisturizer" + 0.015"one" + 0.012"acid" + 0.012"need" + 0.012"toner" + 0.011"start"
10	0.025"lip" + 0.025"hand" + 0.019"vaseline" + 0.018"dry" + 0.013"occlusive" + 0.012"eczema" + 0.011"cream" + 0.011"aquaphor" + 0.011"lotion" + 0.011"help"
11	0.090"sunscreen" + 0.032"spf" + 0.015"sun" + 0.014"one" + 0.008"look" + 0.008"face" + 0.007"makeup" + 0.007"good" + 0.007"day" + 0.007"like"
12	0.018"month" + 0.014"used" + 0.012"scar" + 0.011"time" + 0.011"week" + 0.011"definitely" + 0.010"work" + 0.010"get" + 0.009"got" + 0.009"result"
13	0.030"get" + 0.021"help" + 0.017"pimple" + 0.016"spot" + 0.014"face" + 0.011"one" + 0.011"like" + 0.011"try" + 0.010"area" + 0.009"usually"
14	0.038"product" + 0.018"one" + 0.015"buy" + 0.014"get" + 0.010"amazon" + 0.010"brand" + 0.009"know" + 0.009"post" + 0.009"like" + 0.008"bought"

	Document_No	Dominant_Topic	Topic_Perc_Contrib	Keywords	Text
0	0	9.0	0.1095	product, using, niacinamide, routine, moisturizer, one, acid, need, toner, start	[]
1	1	0.0	0.3187	product, using, face, try, issue, sensitive, cream, think, maybe, tried	[give, urea_cream, try, help, dealing, dehydrated, skin, month, literally, tried, everything_except, urea, trying, see, worked, people, actually, try, work, actually, irritated, skin, made, skin, worse]
2	2	8.0	0.2370	acne, using, routine, tretinoin, differin, tret, help, week, moisturizer, azelaic_acid	[completely, second, like, necessary, kind, start_slowly, gradual, improvement, noticeable, clearing, congestion, angry_pimple, still, breakout, bit]
3	3	0.0	0.3307	product, using, face, try, issue, sensitive, cream, think, maybe, tried	[stop_using, hydrocortisone, eyelid, ask_doctor, protopic, ointment, steroid, safer, thin]
4	4	14.0	0.2183	product, one, buy, get, amazon, brand, know, post, like, bought	[look_amazing, know, name, kind, acid, fill, got, lt, received, hyaluronic_acid, filler, pitted_scarring, received, treatment, four, time, across, span, year, month]
5	5	12.0	0.3875	month, used, scar, time, week, definitely, work, get, got, result	[hey, x200b, start, treatment, hey, session_microneedling, seen, improvementi, actually, going, rf_microneedling, tomorrow, difference]
6	6	8.0	0.2522	acne, using, routine, tretinoin, differin, tret, help, week, moisturizer, azelaic_acid	[started_getting, ton, clogged_pore, forehead, year, never, issue, acne, either, soooo, frustrating, thing, usually, oil, using, moisturizers, protini, usually, keep_bay, acid, differin, always, return, somewhat]
7	7	14.0	0.1620	product, one, buy, get, amazon, brand, know, post, like, bought	[always, use, burner, card, free_trial, use]
8	8	6.0	0.1596	like, moisturizer, love, one, cream, dry, face, great, tried, feel	[second, literally, invisible, tiny_bit, shiny, waiting, see, break]
9	9	11.0	0.5850	sunscreen, spf, sun, one, look, face, makeup, good, day, like	[check, mineral, matte, tint, website, variant31197576560682a, universally, flattering, tint, offer, light_coverage, mattifying, finish, even, look, imperfection]

1. Data Gathering¶

Client

Cluster

2. Data Cleaning and Pre-processing¶

3. Exploratory Data Analysis¶

4. Topic Modelling¶