为了更加全面地了解酒店旅客的评论是否会对之后酒店的服务产生影响,我爬取了TripAdvisor中一个名为Hilton Hawaiian Village酒店的所有英文评论。这里我不会对爬虫的细节进行展开。
# Packages ----
# Attached in the original order: attach order decides which package's
# functions mask same-named functions from packages attached earlier.
library(dplyr)
library(readr)
library(lubridate)
library(ggplot2)
library(tidytext)
library(tidyverse)
library(stringr)
library(tidyr)
library(scales)
library(broom)
library(purrr)
library(widyr)
library(igraph)
library(ggraph)
library(SnowballC)
library(wordcloud)
library(reshape2)

# Use the minimal ggplot2 theme for every plot that does not override it.
theme_set(theme_minimal())
# Load the scraped reviews, keep only rows without any missing field, and
# parse review_date from its "day-monthname-2-digit-year" text form.
# NOTE(review): "%B" matches month names in the current locale; presumably
# the file uses English month names -- confirm.
df <- read_csv(
  "Hilton_Hawaiian_Village_Waikiki_Beach_Resort-Honolulu_Oahu_Hawaii__en.csv"
) %>%
  filter(complete.cases(.)) %>%
  mutate(review_date = as.Date(review_date, format = "%d-%B-%y"))

# Size of the cleaned data and the span of review dates.
dim(df)
min(df$review_date)
max(df$review_date)
我们在TripAdvisor上一共获得了13,701条关于Hilton Hawaiian Village酒店的英文评论,这些评论的时间范围是从2002-03-21到2018-08-02。
# Weekly review volume: round each review date to the nearest week, count the
# reviews per week and draw the counts as a line.
df %>%
  mutate(Week = round_date(review_date, "week")) %>%
  count(Week) %>%
  ggplot(aes(x = Week, y = n)) +
  geom_line() +
  labs(title = "The Number of Reviews Per Week")
# Tag each review with a row ID, store the date as POSIXct and add the month
# it rounds to.
df <- df %>%
  tibble::rowid_to_column("ID") %>%
  mutate(
    review_date = as.POSIXct(review_date, origin = "1970-01-01"),
    month = round_date(review_date, "month")
  )

# One row per (review, word). Duplicate review bodies are dropped first, a
# word is kept once per review, stop words and digit-only tokens are removed,
# and word_total records how many rows each word has.
review_words <- df %>%
  distinct(review_body, .keep_all = TRUE) %>%
  unnest_tokens(word, review_body, drop = FALSE) %>%
  distinct(ID, word, .keep_all = TRUE) %>%
  anti_join(stop_words, by = "word") %>%
  filter(str_detect(word, "[^\\d]")) %>%
  group_by(word) %>%
  mutate(word_total = n()) %>%
  ungroup()

# Number of rows per word, most frequent first.
word_counts <- review_words %>%
  count(word, sort = TRUE)

# Horizontal bar chart of the 25 most frequent words.
word_counts %>%
  head(25) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col(fill = "lightblue") +
  scale_y_continuous(labels = comma_format()) +
  coord_flip() +
  labs(
    title = "Most common words in review text 2002 to date",
    subtitle = "Among 13,701 reviews; stop words removed",
    y = "# of uses"
  )
# Most common word stems.
# Stemming has to happen BEFORE counting: stemming the already-truncated top
# 25 words leaves words that share a stem as separate rows, which geom_col()
# then stacks into a single misleading bar. Here each word is stemmed first,
# a stem is kept once per review (matching how review_words counts words),
# and only then are the 25 most frequent stems selected.
review_words %>%
  mutate(word = wordStem(word)) %>%
  distinct(ID, word) %>%
  count(word, sort = TRUE) %>%
  head(25) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col(fill = "lightblue") +
  scale_y_continuous(labels = comma_format()) +
  coord_flip() +
  labs(
    title = "Most common words in review text 2002 to date",
    subtitle = "Among 13,701 reviews; stop words removed and stemmed",
    y = "# of uses"
  )
所以,在Hilton Hawaiian Village的评论中,哪些是最常见的二元词组呢?
# Split every review into overlapping two-word sequences.
# Reviews with fewer than two tokens produce an NA bigram; those rows are
# dropped so they cannot show up as an "NA NA" bigram in the counts below.
review_bigrams <- df %>%
  unnest_tokens(bigram, review_body, token = "ngrams", n = 2) %>%
  filter(!is.na(bigram))

# One column per word of the bigram.
bigrams_separated <- review_bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ")

# Keep only bigrams in which neither word is a stop word.
bigrams_filtered <- bigrams_separated %>%
  filter(
    !word1 %in% stop_words$word,
    !word2 %in% stop_words$word
  )

bigram_counts <- bigrams_filtered %>%
  count(word1, word2, sort = TRUE)

# Re-join the two words so each bigram can be counted as a single string.
bigrams_united <- bigrams_filtered %>%
  unite(bigram, word1, word2, sep = " ")

bigrams_united %>%
  count(bigram, sort = TRUE)
最常见的二元词组是“rainbow tower”(彩虹塔),其次是“hawaiian village”(夏威夷村)。
# Tokenise every review and drop the standard stop words. The join key is
# spelled out so the join does not depend on which columns happen to be
# shared between the two tables.
review_subject <- df %>%
  unnest_tokens(word, review_body) %>%
  anti_join(stop_words, by = "word")

# Additional stop words: the bare numbers "1" to "10".
# tibble() replaces the deprecated data_frame().
my_stopwords <- tibble(word = as.character(1:10))

review_subject <- review_subject %>%
  anti_join(my_stopwords, by = "word")

# How many reviews (by ID) each pair of words appears in together.
title_word_pairs <- review_subject %>%
  pairwise_count(word, ID, sort = TRUE, upper = FALSE)

# Network of word pairs that co-occur in at least 1000 reviews. The seed is
# fixed before plotting so the "fr" layout is reproducible.
set.seed(1234)
title_word_pairs %>%
  filter(n >= 1000) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(
    aes(edge_alpha = n, edge_width = n),
    edge_colour = "cyan4"
  ) +
  geom_node_point(size = 5) +
  geom_node_text(
    aes(label = name),
    repel = TRUE,
    point.padding = unit(0.2, "lines")
  ) +
  ggtitle("Word network in TripAdvisor reviews") +
  # The `+` was missing here, so theme_void() was never applied to the plot.
  theme_void()
在网络图中我们发现出现频率最高的几个词存在很强的相关性(“hawaiian”, “village”, “ocean” 和“view”),不过我们没有发现明显的聚集现象。
二元词组有时候还不足以说明情况,让我们来看看TripAdvisor中关于Hilton Hawaiian Village酒店最常见的三元词组有哪些。
# Split every review into overlapping three-word sequences.
# Reviews with fewer than three tokens produce an NA trigram; those rows are
# dropped so they cannot show up as an "NA NA NA" trigram in the counts below.
review_trigrams <- df %>%
  unnest_tokens(trigram, review_body, token = "ngrams", n = 3) %>%
  filter(!is.na(trigram))

# One column per word of the trigram.
trigrams_separated <- review_trigrams %>%
  separate(trigram, c("word1", "word2", "word3"), sep = " ")

# Keep only trigrams in which none of the three words is a stop word.
trigrams_filtered <- trigrams_separated %>%
  filter(
    !word1 %in% stop_words$word,
    !word2 %in% stop_words$word,
    !word3 %in% stop_words$word
  )

trigram_counts <- trigrams_filtered %>%
  count(word1, word2, word3, sort = TRUE)

# Re-join the three words so each trigram can be counted as a single string.
trigrams_united <- trigrams_filtered %>%
  unite(trigram, word1, word2, word3, sep = " ")

trigrams_united %>%
  count(trigram, sort = TRUE)
最常见的三元词组是“hilton hawaiian village”,其次是“diamond head tower”,等等。
# Number of reviews in each month.
reviews_per_month <- df %>%
  group_by(month) %>%
  summarize(month_total = n())

# For every word with word_total >= 1000: the number of rows per month
# (months without the word filled with 0), the month's review total, the
# resulting share, and the month expressed as a fractional year.
word_month_counts <- review_words %>%
  filter(word_total >= 1000) %>%
  count(word, month) %>%
  complete(word, month, fill = list(n = 0)) %>%
  inner_join(reviews_per_month, by = "month") %>%
  mutate(percent = n / month_total) %>%
  mutate(year = year(month) + yday(month) / 365)

# Binomial GLM of (reviews with the word, reviews without it) against time;
# the coefficient on `year` is the word's growth rate on the logit scale.
mod <- ~ glm(cbind(n, month_total - n) ~ year, ., family = "binomial")

# Fit one model per word and keep the `year` coefficient, fastest growing
# first. The original nest(-word) / unnest(map(model, tidy)) calls rely on the
# pre-1.0 tidyr interface; split() + map() + bind_rows(.id =) gives the same
# table without depending on the tidyr version.
slopes <- word_month_counts %>%
  split(.$word) %>%
  map(mod) %>%
  map(tidy) %>%
  bind_rows(.id = "word") %>%
  filter(term == "year") %>%
  arrange(desc(estimate))

# Monthly share of reviews for the nine words with the largest slope, one
# facet per word.
slopes %>%
  head(9) %>%
  inner_join(word_month_counts, by = "word") %>%
  mutate(word = reorder(word, -estimate)) %>%
  ggplot(aes(month, n / month_total, color = word)) +
  geom_line(show.legend = FALSE) +
  scale_y_continuous(labels = percent_format()) +
  facet_wrap(~ word, scales = "free_y") +
  expand_limits(y = 0) +
  labs(
    x = "Year",
    y = "Percentage of reviews containing this word",
    title = "9 fastest growing words in TripAdvisor reviews",
    subtitle = "Judged by growth rate over 15 years"
  )
在2010年以前我们可以看到大家讨论的焦点是“friday fireworks”(周五的烟花)和“lagoon”(环礁湖)。而在2005年以前“resort fee”(度假费)和“busy”(繁忙)这些词的词频增长最快。
# Monthly share of reviews for the nine words at the bottom of `slopes`
# (the smallest `year` coefficients), one facet per word.
# The subtitle previously read "over 4 years", which contradicts the span of
# the data and the companion "fastest growing" plot; it now states the same
# 15-year period.
slopes %>%
  tail(9) %>%
  inner_join(word_month_counts, by = "word") %>%
  mutate(word = reorder(word, estimate)) %>%
  ggplot(aes(month, n / month_total, color = word)) +
  geom_line(show.legend = FALSE) +
  scale_y_continuous(labels = percent_format()) +
  facet_wrap(~ word, scales = "free_y") +
  expand_limits(y = 0) +
  labs(
    x = "Year",
    y = "Percentage of reviews containing this term",
    title = "9 fastest shrinking words in TripAdvisor reviews",
    subtitle = "Judged by growth rate over 15 years"
  )
这张图展示了自2010年以来逐渐变少的主题。这些词包括“hhv” (我认为这是 hilton hawaiian village的简称), “breakfast”(早餐), “upgraded”(升级), “prices”(价格) 和 “free”(免费)。
# Compare how often "service" and "food" are mentioned over time, as the
# monthly share of reviews containing each word.
word_month_counts %>%
  filter(word == "service" | word == "food") %>%
  ggplot(aes(x = month, y = n / month_total, color = word)) +
  geom_line(size = 1, alpha = .8) +
  scale_y_continuous(labels = percent_format()) +
  expand_limits(y = 0) +
  labs(
    x = "Year",
    y = "Percentage of reviews containing this term",
    title = "service vs food in terms of reviewers interest"
  )
# Review ID and text only. The former group_by(row_number()) %>% ungroup()
# step grouped nothing useful and left a stray column literally named
# `row_number()` in the result, so it is removed.
reviews <- df %>%
  filter(!is.na(review_body)) %>%
  select(ID, review_body)

# One row per word occurrence, stop words removed.
tidy_reviews <- reviews %>%
  unnest_tokens(word, review_body) %>%
  anti_join(stop_words, by = "word")

# Occurrences of every word that appears in the "bing" lexicon, together with
# its positive/negative label.
bing_word_counts <- tidy_reviews %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment, sort = TRUE)

# Ten most frequent words per sentiment. The ranking column is passed to
# top_n() explicitly instead of relying on "last column" selection.
bing_word_counts %>%
  group_by(sentiment) %>%
  top_n(10, n) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free") +
  labs(y = "Contribution to sentiment", x = NULL) +
  coord_flip() +
  ggtitle("Words that contribute to positive and negative sentiment in the reviews")
# AFINN lexicon with its numeric column normalised to `score`.
# Newer tidytext/textdata releases name this column `value` instead of
# `score`; renaming it here keeps the code working with either version.
afinn_lexicon <- get_sentiments("afinn")
names(afinn_lexicon)[names(afinn_lexicon) == "value"] <- "score"

# Per word: number of occurrences and summed AFINN score.
# (The column name `occurences` is kept as-is so existing uses keep working.)
contributions <- tidy_reviews %>%
  inner_join(afinn_lexicon, by = "word") %>%
  group_by(word) %>%
  summarize(occurences = n(), contribution = sum(score))

# The 25 words with the largest absolute contribution, coloured by sign.
contributions %>%
  top_n(25, abs(contribution)) %>%
  mutate(word = reorder(word, contribution)) %>%
  ggplot(aes(word, contribution, fill = contribution > 0)) +
  ggtitle("Words with the greatest contributions to positive/negative sentiment in reviews") +
  geom_col(show.legend = FALSE) +
  coord_flip()
有意思的是,“diamond”(出自“diamond head-钻石头”)被归类为积极情绪。
# Most frequent words that directly follow "not".
bigrams_separated %>%
  filter(word1 == "not") %>%
  count(word1, word2, sort = TRUE)

# AFINN lexicon with its numeric column normalised to `score`.
# Newer tidytext/textdata releases name this column `value` instead of
# `score`; renaming it here keeps every later use of AFINN$score working
# with either version.
AFINN <- get_sentiments("afinn")
names(AFINN)[names(AFINN) == "value"] <- "score"

# Sentiment-bearing words preceded by "not", with how often each occurs.
not_words <- bigrams_separated %>%
  filter(word1 == "not") %>%
  inner_join(AFINN, by = c(word2 = "word")) %>%
  count(word2, score, sort = TRUE) %>%
  ungroup()

not_words

# The 20 words whose score * frequency is largest in absolute value: these
# are the words a word-by-word sentiment score gets most wrong after "not".
not_words %>%
  mutate(contribution = n * score) %>%
  arrange(desc(abs(contribution))) %>%
  head(20) %>%
  mutate(word2 = reorder(word2, contribution)) %>%
  ggplot(aes(word2, contribution, fill = contribution > 0)) +
  geom_col(show.legend = FALSE) +
  xlab("Words preceded by \"not\"") +
  ylab("Sentiment score * number of occurrences") +
  ggtitle('The 20 words preceded by "not" that had the greatest contribution to sentiment scores, positive or negative direction') +
  coord_flip()
二元词组“not worth”, “not great”, “not good”, “not recommend”和“not like”是导致错误判断的最大根源,使得评论看起来比原来积极得多。
除了“not”以外,还有其他的否定词会对后面的内容进行情绪的扭转,比如“no”, “never” 和“without”。让我们来看一下具体情况。
# Negation terms whose following word should have its sentiment reversed.
negation_words <- c("not", "no", "never", "without")

# Sentiment-bearing words that directly follow one of the negation terms,
# with how often each (negation, word) pair occurs.
negated_words <- bigrams_separated %>%
  filter(word1 %in% negation_words) %>%
  inner_join(AFINN, by = c(word2 = "word")) %>%
  count(word1, word2, score, sort = TRUE) %>%
  ungroup()

# Per negation term, the 12 following words with the largest absolute
# score * frequency. The negation term is appended to each label
# ("word__negation") so the same word can be ordered independently in each
# facet; the suffix is stripped again when the axis labels are drawn.
negated_words %>%
  mutate(contribution = n * score) %>%
  mutate(
    word2 = reorder(paste(word2, word1, sep = "__"), contribution)
  ) %>%
  group_by(word1) %>%
  top_n(12, abs(contribution)) %>%
  ggplot(aes(x = word2, y = contribution, fill = contribution > 0)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ word1, scales = "free") +
  scale_x_discrete(labels = function(x) gsub("__.+$", "", x)) +
  labs(
    x = "Words preceded by negation term",
    y = "Sentiment score * # of occurrences",
    title = 'The most common positive or negative words to follow negations such as "no", "not", "never" and "without"'
  ) +
  coord_flip()
看来导致错判为积极词汇的最大根源来自于“not worth/great/good/recommend”,而另一方面错判为消极词汇的最大根源是“not bad” 和“no problem”。
# AFINN lexicon with its numeric column normalised to `score`.
# Newer tidytext/textdata releases name this column `value` instead of
# `score`; renaming it here keeps the code working with either version.
afinn_by_word <- get_sentiments("afinn")
names(afinn_by_word)[names(afinn_by_word) == "value"] <- "score"

# Mean AFINN score per review, restricted to reviews with at least five
# scored words so that a single word cannot dominate the average.
sentiment_messages <- tidy_reviews %>%
  inner_join(afinn_by_word, by = "word") %>%
  group_by(ID) %>%
  summarize(sentiment = mean(score), words = n()) %>%
  ungroup() %>%
  filter(words >= 5)

# Most positive reviews first.
sentiment_messages %>%
  arrange(desc(sentiment))

# Text of review 2363.
# NOTE(review): the ID is hard-coded; presumably it is the top row of the
# ranking above -- verify after re-running on fresh data.
df$review_body[which(df$ID == 2363)][1]

# Most negative reviews first.
sentiment_messages %>%
  arrange(sentiment)

# Text of review 3748.
# NOTE(review): the ID is hard-coded; presumably it is the top row of the
# ranking above -- verify after re-running on fresh data.
df$review_body[which(df$ID == 3748)][1]
