Text Mining for Indonesian Online News Articles about Corona
A workflow for scraping news articles about the Coronavirus from tirto.id and detik.com using the BeautifulSoup package. The contents are saved to individual tab-separated value (TSV) files, then cleaned and loaded again for further analysis. We then analyze the posting pattern of each site and train a Word2Vec model using the gensim package to examine the semantic and syntactic similarities between the preprocessed words.
from bs4 import BeautifulSoup
import requests
import pandas as pd
# text preprocessing
import re
import dateparser
from datetime import datetime
from itertools import repeat
from gensim.utils import simple_preprocess
# analyzing text
from gensim.models import Word2Vec
from nltk.probability import FreqDist
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
# visualization
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from wordcloud import WordCloud, ImageColorGenerator
from PIL import Image
from adjustText import adjust_text
# other
import os
import time
import math
import multiprocessing
from tqdm import tqdm
from jupyterthemes import jtplot
jtplot.style()
pd.options.mode.chained_assignment = None
Topic of Interest
Before we jump into the scraping step, let us define a query which will be our topic of interest. At the time of writing this post, the Coronavirus is considered the next viral public health pandemic - tens of thousands of people are infected, causing thousands of deaths. Here, we are interested in seeing how the Indonesian news media report on "corona".
query = "corona"
Scraping
There will be two sites as our main sources of analysis: tirto.id and detik.com. The workflow for scraping is as follows:
- Access the link, followed by the search query parameter
- Get the total page number found based on the query
- Get the title and URL of each article
- Loop through each URL to scrape the content of each article and its details (Category, Author Name, Posted Date)
- Save to a .tsv file
Access the Link
In this step, we access tirto.id followed by the "corona" query and scrape the raw HTML content.
url_base = "https://tirto.id"
url_query = url_base + "/search?q=" + query
req = requests.get(url_query)
soup = BeautifulSoup(req.content, "html.parser")
str(soup)[:500]
try:
find_pagination = soup.findAll("li", attrs={"class": "pagination-item"})
pagination_list = [row.a.text for row in find_pagination]
total_page = int(pagination_list[-2])
print("Total Page: {}".format(total_page))
except:
print("Article Not Found")
tirtoid_articles = []
for page_num in tqdm(range(1, total_page+1)):
url = url_query + "&p=" + str(page_num)
r = requests.get(url)
s = BeautifulSoup(r.content, "html.parser")
find_article = s.findAll("div", attrs={"class": "news-list-fade"})
for row in find_article:
article = {}
article['title'] = row.h1.text
article['url'] = url_base + row.a['href']
tirtoid_articles.append(article)
print("Total Articles: {}".format(len(tirtoid_articles)))
tirtoid_articles[:3]
counter = 0
for article in tqdm(tirtoid_articles):
counter += 1
# access the article url
req_article = requests.get(article['url'])
soup_article = BeautifulSoup(req_article.content, "html.parser")
# preprocessing html
for s in soup_article(['script', 'style']):
s.decompose()
for br in soup_article.find_all("br"):
br.replace_with(" ")
# get article category
find_category = soup_article.findAll("a", attrs={"itemprop": "item"})
article['category'] = find_category[-1].text if len(find_category) else ""
# get author name and posted date
find_author_date = soup_article.find(
"span", attrs={"class": "detail-date"})
match = re.search(":[a-zA-Z\\.\\s]+-", find_author_date.text)
if match is not None:
article['author_name'] = re.sub(
r'\t', '', match.group(0)[2:-2].title())
match = re.search("\\d{1,2} [a-zA-Z]+ \\d{4}", find_author_date.text)
if match is not None:
article['posted_date'] = match.group(0)
# get article content (but exclude the "Baca juga" section)
find_baca_juga_section = soup_article.find(
"div", attrs={"class": "baca-holder"})
    if find_baca_juga_section is not None:
        find_baca_juga_section.decompose()
content = ""
article_table = soup_article.findAll(
"div", attrs={"class": "content-text-editor"})[:-1]
article['content'] = " ".join(
[re.sub(r'\s+', ' ', row.text) for row in article_table])
tirtoid_df = pd.DataFrame(tirtoid_articles)
tirtoid_df.to_csv("output/tirtoid_{}.tsv".format(query), sep="\t", index=False)
Access the Link
In this step, we access detik.com followed by the "corona" query and scrape the raw HTML content.
url_base = "https://www.detik.com"
url_query = url_base + "/search/searchnews?query=" + query
req = requests.get(url_query)
soup = BeautifulSoup(req.content, "html.parser")
str(soup)[:500]
try:
find_total_article = soup.find("div", attrs={"class": "search-result"})
total_article_match = re.search("\\d+", find_total_article.span.text)
total_article = int(total_article_match.group(0))
total_page = int(math.ceil(total_article/9))
total_page = min(1111, total_page) # detik only provides max. 1111 pages
print("Total Page: {}".format(total_page))
except:
print("Article Not Found")
detikcom_articles = []
counter = 0
for page_num in tqdm(range(1, total_page+1)):
counter += 1
url = url_query + "&page=" + str(page_num)
r = requests.get(url)
s = BeautifulSoup(r.content, "html.parser")
find_article = s.findAll("article")
for row in find_article:
article = {}
# get url
article['url'] = row.a['href']
# get title
article['title'] = row.h2.text
# get category
find_category = row.find("span", attrs={"class": "category"})
article['category'] = find_category.text
find_category.decompose()
# get posted date
article['posted_date'] = row.find("span", attrs={"class": "date"}).text
detikcom_articles.append(article)
print("Total Articles: {}".format(len(detikcom_articles)))
detikcom_articles[:3]
counter = 0
for article in tqdm(detikcom_articles):
counter += 1
# access the article url
try:
req_article = requests.get(article['url'] + "?single=1")
except:
continue
soup_article = BeautifulSoup(req_article.content, "html.parser")
# preprocessing html
for s in soup_article(['script', 'style']):
s.decompose()
for br in soup_article.find_all("br"):
br.replace_with(" ")
    # get author name ("detail__author" is an assumed class name for
    # detik.com article pages and may need adjusting)
    find_author = soup_article.find("div", attrs={"class": "detail__author"})
    if find_author is not None:
        match = re.search("[a-zA-Z\\.\\s]+-", find_author.text)
        if match is not None:
            article['author_name'] = match.group(0)[:-2].title()
# get article content
content = ""
find_div = soup_article.find("div", attrs={"class": "detail__body-text"})
if find_div is None:
find_div = soup_article.find("div", attrs={"class": "itp_bodycontent"})
if find_div is None:
find_div = soup_article.find("div", attrs={"class": "detail_text"})
if find_div is not None:
article_content = find_div.findAll("p")
if len(article_content) == 0:
article_content = [find_div]
article['content'] = " ".join(
[re.sub(r'\s+', ' ', row.text) for row in article_content])
else:
article['content'] = ""
detikcom_df = pd.DataFrame(detikcom_articles)
detikcom_df.to_csv("output/detikcom_{}.tsv".format(query),
sep="\t", index=False)
tirtoid_articles = pd.read_csv("output/tirtoid_{}.tsv".format(query),
sep="\t",
parse_dates=['posted_date'],
date_parser=dateparser.parse).replace(np.nan, '', regex=True)
tirtoid_articles['site'] = 'tirtoid'
tirtoid_articles.head()
detikcom_articles = pd.read_csv(
"output/detikcom_{}.tsv".format(query), sep="\t").replace(np.nan, '', regex=True)
detikcom_articles['posted_date'] = detikcom_articles['posted_date'].str.extract(
'(\d{2} [A-Za-z]+ \d{4})')[0].apply(dateparser.parse)
detikcom_articles['site'] = 'detikcom'
detikcom_articles.head()
Other things to do are:
- Combine both DataFrame objects into one
- Convert the category and site columns to the categorical data type
combined_articles = pd.concat(
[tirtoid_articles, detikcom_articles], ignore_index=True)
combined_articles[['category', 'site']] = combined_articles[[
'category', 'site']].astype('category')
combined_articles.dtypes
Here's the article count for each site:
combined_articles.groupby('site')['content'].count()
start_date = '2020-01-01'
current_date = datetime.now().strftime("%Y-%m-%d")
articles_2020 = combined_articles[(combined_articles['posted_date'] >= start_date) & (
combined_articles['posted_date'] < current_date)]
print('Percentage of Articles before {}: ~{:.2f}%'.format(
start_date, 100*(1-articles_2020.shape[0]/combined_articles.shape[0])))
articles_2020.head()
Let's aggregate articles_2020 as follows:
- Count articles by each site and posted_date
- Total the aggregated articles_count across each row
- Set posted_date as the index
- Replace NaN with 0
articles_count = articles_2020.groupby(['site', 'posted_date'])[
'content'].count().unstack(level=0)
# total
articles_count.columns = articles_count.columns.add_categories(['total'])
articles_count['total'] = articles_count.sum(axis=1)
# reindex
date_2020 = pd.date_range(start=start_date, end=max(articles_count.index))
articles_count = articles_count.reindex(date_2020)
# replace
articles_count = articles_count.fillna(0)
articles_count.tail()
# line plot
ax = articles_count.plot(xlim=pd.Timestamp(start_date),
grid=True,
colormap='jet')
# mean line
mean_value = articles_count['total'].mean()
plt.axhline(y=mean_value,
color='r',
linestyle='--',
lw=1)
plt.text(s="Total count mean",
x=pd.Timestamp('2020-01-04'),
y=mean_value + 10,
color='r')
# set x axis ticks
ax.xaxis.set_major_locator(mdates.MonthLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter("%b"))
ax.xaxis.set_minor_locator(mdates.WeekdayLocator(byweekday=4))
ax.xaxis.set_minor_formatter(mdates.DateFormatter("%d"))
plt.setp(ax.get_xticklabels(), rotation=45, horizontalalignment='right')
# modify legend title
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, labels)
# add label
plt.xlabel('Posted Date')
plt.ylabel('Count')
plt.title("Articles Count on {} (2020)".format(query.title()))
plt.show()
From the line plot, we can conclude that:
- The articles on "corona" blew up starting from the third week of January 2020 and reached their peak in the first week of February 2020.
- There is a weekly seasonality: each cycle peaks during the first half of the week and drops during the weekend (see the quick check after this list).
- detik.com posted "corona" articles about four times more frequently than tirto.id did.
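To double-check the weekly cycle, here is a minimal sketch that averages the total article count per weekday, assuming the articles_count frame built above (the exact numbers depend on when the data was scraped):
# average total article count per weekday (0 = Monday) as a rough seasonality check
weekday_avg = articles_count['total'].groupby(articles_count.index.dayofweek).mean()
weekday_avg.index = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
weekday_avg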
Text Preprocessing
Before we jump into the analysis, the scraped text must be preprocessed by:
- Removing stopwords, which are generally a list of most common words used in a language
- Removing numbers and punctuation
These words are removed because they do not provide useful information for determining the context of a sentence. Here are the Bahasa Indonesia stopword lists available in the data_input/stopwords-list folder:
- sastrawi-stopwords.txt
- pebbie-pebahasa.txt
- aliakbars-bilp.txt
- fpmipa-stopwords.txt
- week-month-name-id.txt: self-defined stopwords containing week name and month name in Indonesian.
stopwords_path = "data_input/stopwords-list/stopwords-id"
stopwords_list = []
for filename in os.listdir(stopwords_path):
stopwords = list(
open("{}/{}".format(stopwords_path, filename)).read().splitlines())
stopwords_list.extend(stopwords)
stopwords_list = sorted(set(stopwords_list))
week_month_name = list(open(
"data_input/stopwords-list/stopwords-id/week-month-name-id.txt").read().splitlines())
custom_stopwords = ['tirto.id', 'baca juga', 'gambas', 'video detik', 'rp']
def remove_words(sentence, words2remove):
for word in words2remove:
sentence = re.sub(r'\b' + word + r'\b', '', sentence.lower())
sentence = re.sub(r'\s+', ' ', sentence).strip()
return sentence
def text_cleaning(text):
col_name = text.name
# remove stopwords
print("Removing stopwords of {}".format(col_name))
time.sleep(0.5)
text = list(map(remove_words, tqdm(text), repeat(
stopwords_list + custom_stopwords + week_month_name)))
# remove numbers and punctuations
text = list(map(simple_preprocess, text))
clean_text = list(map(' '.join, text))
return clean_text
clean_title = text_cleaning(articles_2020['title'])
clean_content = text_cleaning(articles_2020['content'])
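To see what remove_words and simple_preprocess each do, here is a quick illustration on a made-up sentence (a hypothetical example, not taken from the scraped data):
sample = "Pemerintah Indonesia memantau 10 kasus virus corona pada 5 Februari 2020."
no_stopwords = remove_words(sample, stopwords_list + custom_stopwords + week_month_name)
print(no_stopwords)
# simple_preprocess also drops numbers, punctuation, and very short tokens
print(simple_preprocess(no_stopwords))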
Save the preprocessed title and content to a separate .tsv file.
articles_2020['clean_title'] = clean_title
articles_2020['clean_content'] = clean_content
articles_2020.to_csv("output/articles_2020_clean_{}.tsv".format(query), sep="\t", index=False)
articles_2020_clean = pd.read_csv("output/articles_2020_clean_{}.tsv".format(query), sep="\t")
articles_2020_clean.head()
Here is the list of preprocessed texts to be analyzed:
clean_text = list(pd.concat([articles_2020_clean['clean_title'],
articles_2020_clean['clean_content']]).sort_index().dropna())
clean_text[:2]
top_n_words = 50
tokens = ' '.join(clean_text).split(' ')
fd = FreqDist(tokens)
word_freq = pd.DataFrame(list(fd.items()), columns=["Word", "Frequency"])\
.sort_values(by='Frequency', ascending=False)
top_50_words = word_freq[:top_n_words]
top_50_words.head()
ax = top_50_words.plot(kind='barh', x='Word')
ax.invert_yaxis()
ax.get_legend().remove()
# add label
plt.xlabel('Frequency')
plt.ylabel('Word')
plt.title("Top {} Word Count on {} Articles".format(
top_n_words, query.title()))
plt.gcf().set_size_inches(6, 8)
plt.show()
From the bar chart, it's obvious that 'virus' and 'corona' appear as the most frequent words. But there are also several groups of related words being mentioned, such as:
- Country/city: 'china', 'indonesia', 'wuhan', 'jakarta', 'natuna', 'singapura', 'jepang', 'as', 'hubei'
- Institution: 'kesehatan', 'pemerintah', 'kementerian', 'menteri', 'who'
- Impact: 'masker', 'kapal', 'penerbangan', 'pesawat', 'ekonomi', 'bandara', 'harga', 'pasar'
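As a quick check on these groups, we can look up a few of them directly in word_freq (only tokens that survived preprocessing will appear):
# frequency lookup for a handful of the category words mentioned above
category_words = ['china', 'wuhan', 'jakarta', 'kesehatan', 'who', 'masker', 'ekonomi']
word_freq[word_freq['Word'].isin(category_words)]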
# define mask
mask = np.array(Image.open("data_input/china-map.png"))
mask[mask == 0] = 255
# create wordcloud
wordcloud_text = ' '.join(clean_text)
wordcloud = WordCloud(
max_words=top_n_words,
background_color="white",
mask=mask).generate(wordcloud_text)
# visualize
image_colors = ImageColorGenerator(mask)
plt.figure(figsize=[10, 10])
plt.imshow(wordcloud.recolor(color_func=image_colors),
interpolation='bilinear')
plt.axis("off")
plt.show()
unique_words = set(tokens)
print("Token size: {} words".format(len(tokens)))
print("Unique token size: {} words".format(len(unique_words)))
avg_freq_token = len(tokens)/len(unique_words)
print("Average Frequency for each token: {0:.2f} times".format(avg_freq_token))
Training Model
We will be using the gensim package to train the model. Here is an explanation of each parameter (Documentation):
- sentences = list of tokens to be trained on
- size = dimensionality of the word vectors
- window = max. distance between the current and the predicted word within a sentence
- min_count = ignore tokens with a total frequency below this number
- workers = number of worker threads used to train the model
- iter = number of iterations over the corpus
- seed = seed for the random number generator
clean_token = list(map(simple_preprocess, clean_text))
model = Word2Vec(clean_token,
size=500,
window=5,
min_count=math.ceil(avg_freq_token),
workers=multiprocessing.cpu_count() - 1,
iter=1000,
seed=123)
model.save("cache/word2vec_{}.model".format(query))
Rather than training the same model every time we load this notebook, let's use the pre-trained model.
model = Word2Vec.load("cache/word2vec_{}.model".format(query))
Dimensionality Reduction
The vector size of the previous Word2Vec model was set to 500 dimensions. In order to visualize it in a plot, we have to reduce the dimensionality to 2 or 3 dimensions. In this section, we use PCA and t-SNE to reduce the dimensionality to 2 (x and y coordinates). The outputs of the two methods will be different, giving us two plots from different perspectives.
vocab = list(model.wv.vocab)
pca = PCA(n_components=2, random_state=123)
X_pca = pca.fit_transform(model[vocab])
word_plot_pca = pd.DataFrame(X_pca, index=vocab, columns=['x', 'y'])
word_plot_pca.head()
vocab = list(model.wv.vocab)
tsne = TSNE(n_components=2, random_state=123)
X_tsne = tsne.fit_transform(model[vocab])
word_plot_tsne = pd.DataFrame(X_tsne, index=vocab, columns=['x', 'y'])
word_plot_tsne.head()
fig, [ax1, ax2] = plt.subplots(nrows=1, ncols=2)
ax1.scatter(word_plot_pca['x'], word_plot_pca['y'])
for word, row in word_plot_pca.iterrows():
ax1.annotate(word, (row['x'], row['y']))
ax2.scatter(word_plot_tsne['x'], word_plot_tsne['y'])
for word, row in word_plot_tsne.iterrows():
ax2.annotate(word, (row['x'], row['y']))
ax1.title.set_text("Cluster of Words (PCA)")
ax2.title.set_text("Cluster of Words (T-SNE)")
plt.setp([ax1, ax2], xticks=[], yticks=[])
plt.gcf().set_size_inches(15, 5)
plt.tight_layout()
plt.show()
Unfortunately, both plots look very cluttered, crowded with tokens. Therefore, it's better to construct a word similarity network.
Word Similarity Network
In this last section, we try to build a word similarity network for a cleaner visualization. Two words are linked together if they are similar to each other based on cosine similarity, which can be obtained using the model.wv.most_similar() function (see the short example after this list). Here's the explanation of each parameter of the plot_word_similarity_network() function:
- word_plot = DataFrame containing the vector representation of each word
- root = word of interest
- neighbour = number of most similar words to be linked together
- levels = max. number of connection levels; the root is considered level = 0
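Before building the full network, here is a short sketch of what model.wv.most_similar() returns for the root word; the exact neighbours and similarity values depend on the trained model:
# top 5 words most similar to the query word, with their cosine similarities
for word, similarity in model.wv.most_similar(query, topn=5):
    print("{:<15} {:.3f}".format(word, similarity))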
def plot_word_similarity_network(word_plot, root, neighbour, levels):
fig, ax = plt.subplots()
colors = 'rbycgmk'
def text_sizes(x): return 26-3*x
hierarchy_dict = {}
plotted_words = [root]
annotation_list = []
avoid_list = []
for level in range(levels+1):
if level == 0:
# only plot root word
coord = (word_plot.loc[root]['x'], word_plot.loc[root]['y'])
# plot point
p = ax.scatter(coord[0], coord[1],
s=100, c=colors[level],
label="Root")
avoid_list.append(p)
# annotate
txt = ax.text(coord[0], coord[1], root,
size=text_sizes(level),
color=colors[level])
annotation_list.append(txt)
similar_words = [root]
else:
current_hierarchy_words = []
hierarchy_words = []
for word in similar_words:
next_similar_words = [word for word, sim in model.wv.most_similar(word)[
:neighbour]]
for sim_word in next_similar_words:
if sim_word not in plotted_words:
hierarchy_words.append(
"{} -> {}".format(word, sim_word))
current_hierarchy_words.append(sim_word)
coord = (word_plot.loc[sim_word]['x'],
word_plot.loc[sim_word]['y'])
# plot line
l = ax.annotate('',
xy=(word_plot.loc[word]['x'],
word_plot.loc[word]['y']),
xytext=coord,
arrowprops=dict(arrowstyle='-', lw=1, color=colors[level-1]))
# plot point
p = ax.scatter(coord[0], coord[1],
s=100, c=colors[level],
label="Level {}".format(level))
avoid_list.append(p)
# annotate
txt = ax.text(coord[0], coord[1], sim_word,
size=text_sizes(level),
color=colors[level])
annotation_list.append(txt)
plotted_words.append(sim_word)
similar_words = current_hierarchy_words
hierarchy_dict[level] = hierarchy_words
# show legend, without duplicate
handles, labels = ax.get_legend_handles_labels()
unique = [(h, l) for i, (h, l) in enumerate(
zip(handles, labels)) if l not in labels[:i]]
ax.legend(*zip(*unique), loc="upper right", bbox_to_anchor=(0, 1))
# modify axis, title, and size
plt.xticks([])
plt.yticks([])
plt.title("Word Similarity Network of '{}'\n(Neighbour: {})".format(
root.title(), neighbour))
plt.gcf().set_size_inches(15, 20)
# repel
adjust_text(annotation_list,
add_objects=avoid_list,
force_points=0.25,
precision=0.1,
arrowprops=dict(arrowstyle='<-',
color='k',
alpha=0.1)
)
plt.show()
return hierarchy_dict
hierarchy_dict_pca = plot_word_similarity_network(
word_plot_pca,
root="corona",
neighbour=5,
levels=2)
hierarchy_dict_tsne = plot_word_similarity_network(
word_plot_tsne,
root="corona",
neighbour=5,
levels=2)
hierarchy_dict_pca
Both similarity networks visualize the same connections, only from different perspectives.
From the visualization, we can analyze the first level of similarity:
- 'ncov', 'korona', and 'covid', which are basically synonyms/epidemic names for 'corona'.
- 'china', as the coronavirus was first identified in Wuhan, China.
- 'mematikan' ("deadly"), as the number of death cases keeps increasing over time (source).
Interestingly, the Word2Vec model can capture the fact that corona is also related to other respiratory problems such as 'sars' (Severe Acute Respiratory Syndrome) and 'pneumonia' (lung infection) within just two levels of similarity.
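We can also quantify these relations directly with pairwise cosine similarities (a quick check; the tokens are assumed to have survived the min_count filter, and the values depend on the training run):
# cosine similarity between 'corona' and a few of the related tokens above
for other in ['covid', 'china', 'sars', 'pneumonia']:
    if other in model.wv.vocab:
        print("corona ~ {:<10}: {:.3f}".format(other, model.wv.similarity('corona', other)))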
Conclusion
In this post, we successfully scraped thousands of articles from tirto.id and detik.com. From there, we analyzed the seasonality present in the posting pattern and also trained a Word2Vec model to capture the context of words. There are several things that could be improved in the future:
- Predict the posting pattern using time series forecasting.
- Increase the corpus size for a more robust model.
- Use the skip-gram architecture for the Word2Vec model and compare it with CBOW (Continuous Bag-of-Words); see the sketch below.
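For the last point, switching the architecture is a one-parameter change in gensim; here is a sketch that reuses the settings above (sg=1 selects skip-gram, while the default sg=0 is CBOW):
model_sg = Word2Vec(clean_token,
                    size=500,
                    window=5,
                    min_count=math.ceil(avg_freq_token),
                    workers=multiprocessing.cpu_count() - 1,
                    iter=1000,
                    sg=1,  # skip-gram instead of the default CBOW
                    seed=123)
model_sg.save("cache/word2vec_sg_{}.model".format(query))
The two models could then be compared, for example, by inspecting the most_similar() neighbours of the same root words.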