Philippine Startups Wordcloud
In [2]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation
In [3]:
# Load the startup dataset exported from Google Sheets.
csv_path = "../files/Philippine Startups - Sheet1.csv"
raw = pd.read_csv(csv_path)
In [4]:
# Keep only the long-form description text — this is the corpus
# the wordcloud will be built from.
descriptions = raw.loc[:, 'Long Description']

# Preview a few rows to confirm the column was picked up correctly.
descriptions.head()
Out[4]:
In [5]:
# Tokenize all descriptions as one big document.
# Drop missing descriptions first: pandas represents them as NaN (a float),
# which would make " ".join() raise a TypeError; astype(str) guards any
# remaining non-string values.
raw_words = word_tokenize(" ".join(descriptions.dropna().astype(str)))
In [6]:
# Exclusion set: English stopwords plus every single-character punctuation mark.
stop_words = set(stopwords.words('english') + list(punctuation))

# Lowercase each token exactly once (the original computed w.lower() twice),
# then keep only meaningful words: not a stopword or punctuation mark,
# not purely numeric, and longer than 3 characters.
# Note: lower() changes neither isdigit() nor len(), so behavior is unchanged.
words = [
    lw
    for lw in (w.lower() for w in raw_words)
    if lw not in stop_words and not lw.isdigit() and len(lw) > 3
]
In [7]:
words[:20]  # spot-check the first 20 filtered tokens
Out[7]:
In [8]:
# Collapse the filtered tokens back into one space-separated string,
# ready to be pasted into a wordcloud generator.
word_str = " ".join(words)

# Sanity-check: preview only the first 1000 characters.
preview = word_str[:1000]
preview
Out[8]:
In [9]:
# Persist the cleaned word list for the external wordcloud tool.
# Specify UTF-8 explicitly: the platform-default encoding of open(..., "w")
# could raise UnicodeEncodeError on non-ASCII characters in the descriptions.
with open("../files/phstartupwords.txt", "w", encoding="utf-8") as f:
    f.write(word_str)
Lazy Wordcloud Visualization
Paste the contents of the generated file into http://www.wordclouds.com/, and manually remove the words that occur fewer than 3 times:
Comments
Comments powered by Disqus