作者：Susan Li　翻译：吴金笛　校对：中华

# 文本数据探索性数据分析结合可视化和NLP产生见解（附代码）

Photo credit: Pixabay

文本文档内容的可视化表示是文本挖掘领域中最重要的任务之一。作为一名数据科学家或NLP专家，我们不仅要从不同方面和不同细节层面来探索文档的内容，还要总结单个文档，显示单词和主题，检测事件，以及创建故事情节。

#### 数据

# Load the women's e-commerce clothing reviews dataset and keep only rows
# that actually contain review text.
df = pd.read_csv('Womens Clothing E-Commerce Reviews.csv')

# The CSV carries a redundant index column and a Title column we never use.
df.drop('Unnamed: 0', axis=1, inplace=True)
df.drop('Title', axis=1, inplace=True)

# Rows without review text are useless for text mining; drop them.
df = df[df['Review Text'].notnull()]

def preprocess(ReviewText):
    """Strip HTML artifacts and entities from a Series of review strings.

    Parameters
    ----------
    ReviewText : pandas.Series of str
        Raw review texts.

    Returns
    -------
    pandas.Series
        Texts with ``<br/>`` tags, anchor tags, bare HTML entities
        (&amp / &gt / &lt) removed and non-breaking spaces normalized.
    """
    # The original listing's patterns were garbled by extraction; these are
    # the intended HTML-cleanup regexes. regex=True is stated explicitly
    # because modern pandas defaults str.replace to literal matching.
    ReviewText = ReviewText.str.replace(r"(<br/>)", "", regex=True)
    ReviewText = ReviewText.str.replace(r'(<a).*(>).*(</a>)', '', regex=True)
    ReviewText = ReviewText.str.replace(r'(&amp)', '', regex=True)
    ReviewText = ReviewText.str.replace(r'(&gt)', '', regex=True)
    ReviewText = ReviewText.str.replace(r'(&lt)', '', regex=True)
    # \xa0 is a non-breaking space; map it to a regular space.
    ReviewText = ReviewText.str.replace('(\xa0)', ' ', regex=True)
    return ReviewText

# Clean the raw text, then derive three numeric features per review:
# sentiment polarity (TextBlob, in [-1, 1]), character length, word count.
df['Review Text'] = preprocess(df['Review Text'])

def _polarity(text):
    # TextBlob sentiment polarity: -1 (most negative) .. +1 (most positive).
    return TextBlob(text).sentiment.polarity

df['polarity'] = df['Review Text'].map(_polarity)
df['review_len'] = df['Review Text'].astype(str).map(len)
df['word_count'] = df['Review Text'].map(lambda x: len(str(x).split()))

text_preprocessing.py

# Show concrete example reviews from each end of the sentiment spectrum.

print('5 random reviews with the highest positive sentiment polarity: \n')
cl = df.loc[df.polarity == 1, ['Review Text']].sample(5).values
for c in cl:
    print(c[0])

print('5 random reviews with the most neutral sentiment(zero) polarity: \n')
cl = df.loc[df.polarity == 0, ['Review Text']].sample(5).values
for c in cl:
    print(c[0])

print('2 reviews with the most negative polarity: \n')
# Fix: comparing floats with == against the hard-coded literal
# -0.97500000000000009 is fragile and dataset-specific; select whatever
# the actual minimum polarity is instead.
cl = df.loc[df.polarity == df.polarity.min(), ['Review Text']].sample(2).values
for c in cl:
    print(c[0])

#### 使用Plotly进行单变量可视化

# One-variable distributions rendered as interactive cufflinks histograms.
# Each spec is (column, bins, x-axis label, chart title); bins=None means
# use the plotting default (the Rating chart had no explicit bin count).
_hist_specs = [
    ('polarity', 50, 'polarity', 'Sentiment Polarity Distribution'),
    ('Rating', None, 'rating', 'Review Rating Distribution'),
    ('Age', 50, 'age', 'Reviewers Age Distribution'),
    ('review_len', 100, 'review length', 'Review Text Length Distribution'),
    ('word_count', 100, 'word count', 'Review Text Word Count Distribution'),
]
for _col, _bins, _xtitle, _title in _hist_specs:
    _kwargs = dict(kind='hist', xTitle=_xtitle, linecolor='black',
                   yTitle='count', title=_title)
    if _bins is not None:
        _kwargs['bins'] = _bins
    df[_col].iplot(**_kwargs)

#### Division的分布

# Count reviews per category and draw a bar chart for each grouping level.
# The original listing was garbled: .count() lost its parentheses, the
# ['Clothing ID'] subscript was misplaced, the third line lost its `df`,
# and the column names were missing their spaces ('Division Name', etc.).
df.groupby('Division Name').count()['Clothing ID'].iplot(
    kind='bar', yTitle='Count', linecolor='black', opacity=0.8,
    title='Bar chart of Division Name', xTitle='Division Name')

df.groupby('Department Name').count()['Clothing ID'].sort_values(ascending=False).iplot(
    kind='bar', yTitle='Count', linecolor='black', opacity=0.8,
    title='Bar chart of Department Name', xTitle='Department Name')

df.groupby('Class Name').count()['Clothing ID'].sort_values(ascending=False).iplot(
    kind='bar', yTitle='Count', linecolor='black', opacity=0.8,
    title='Bar chart of Class Name', xTitle='Class Name')

def get_top_n_words(corpus, n=None):
    """Return the n most frequent single words in *corpus* (stop words kept)."""
    vec = CountVectorizer().fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    freqs = sorted(
        ((word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return freqs[:n]

common_words = get_top_n_words(df['Review Text'], 20)
for word, freq in common_words:
    print(word, freq)

df1 = pd.DataFrame(common_words, columns=['ReviewText', 'count'])
df1.groupby('ReviewText').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', yTitle='Count', linecolor='black',
    title='Top 20 words in review before removing stop words')

top_unigram.py

def get_top_n_words(corpus, n=None):
    """Return the n most frequent single words in *corpus*, English stop words removed."""
    vec = CountVectorizer(stop_words='english').fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    freqs = sorted(
        ((word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return freqs[:n]

common_words = get_top_n_words(df['Review Text'], 20)
for word, freq in common_words:
    print(word, freq)

df2 = pd.DataFrame(common_words, columns=['ReviewText', 'count'])
df2.groupby('ReviewText').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', yTitle='Count', linecolor='black',
    title='Top 20 words in review after removing stop words')

top_unigram_no_stopwords.py

def get_top_n_bigram(corpus, n=None):
    """Return the n most frequent bigrams in *corpus* (stop words kept)."""
    vec = CountVectorizer(ngram_range=(2, 2)).fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    freqs = sorted(
        ((word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return freqs[:n]

common_words = get_top_n_bigram(df['Review Text'], 20)
for word, freq in common_words:
    print(word, freq)

df3 = pd.DataFrame(common_words, columns=['ReviewText', 'count'])
df3.groupby('ReviewText').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', yTitle='Count', linecolor='black',
    title='Top 20 bigrams in review before removing stop words')

top_bigram.py

def get_top_n_bigram(corpus, n=None):
    """Return the n most frequent bigrams in *corpus*, English stop words removed."""
    vec = CountVectorizer(ngram_range=(2, 2), stop_words='english').fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    freqs = sorted(
        ((word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return freqs[:n]

common_words = get_top_n_bigram(df['Review Text'], 20)
for word, freq in common_words:
    print(word, freq)

df4 = pd.DataFrame(common_words, columns=['ReviewText', 'count'])
df4.groupby('ReviewText').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', yTitle='Count', linecolor='black',
    title='Top 20 bigrams in review after removing stop words')

top_bigram_no_stopwords.py

def get_top_n_trigram(corpus, n=None):
    """Return the n most frequent trigrams in *corpus* (stop words kept)."""
    vec = CountVectorizer(ngram_range=(3, 3)).fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    freqs = sorted(
        ((word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return freqs[:n]

common_words = get_top_n_trigram(df['Review Text'], 20)
for word, freq in common_words:
    print(word, freq)

df5 = pd.DataFrame(common_words, columns=['ReviewText', 'count'])
df5.groupby('ReviewText').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', yTitle='Count', linecolor='black',
    title='Top 20 trigrams in review before removing stop words')

top_trigram.py

def get_top_n_trigram(corpus, n=None):
    """Return the n most frequent trigrams in *corpus*, English stop words removed."""
    vec = CountVectorizer(ngram_range=(3, 3), stop_words='english').fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    freqs = sorted(
        ((word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return freqs[:n]

common_words = get_top_n_trigram(df['Review Text'], 20)
for word, freq in common_words:
    print(word, freq)

df6 = pd.DataFrame(common_words, columns=['ReviewText', 'count'])
df6.groupby('ReviewText').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', yTitle='Count', linecolor='black',
    title='Top 20 trigrams in review after removing stop words')

top_trigram_no_stopwords.py

# Part-of-speech tag the review corpus and plot the 20 most common tags.
# Fix: the original passed str(df['Review Text']) to TextBlob, which tags
# the *repr* of the Series (index numbers, '...' truncation marker, dtype
# footer) rather than the reviews themselves. Join the actual texts.
blob = TextBlob(' '.join(df['Review Text'].astype(str)))
pos_df = pd.DataFrame(blob.tags, columns=['word', 'pos'])
pos_df = pos_df.pos.value_counts()[:20]
pos_df.iplot(
    kind='bar',
    xTitle='POS',
    yTitle='count',
    title='Top 20 Part-of-speech tagging for review corpus')

POS.py

# Sentiment-polarity box plot per department: one Box trace per department,
# each with its own fixed marker color.
_dept_colors = [
    ('Tops', 'rgb(214, 12, 140)'),
    ('Dresses', 'rgb(0, 128, 128)'),
    ('Bottoms', 'rgb(10, 140, 208)'),
    ('Intimate', 'rgb(12, 102, 14)'),
    ('Jackets', 'rgb(10, 0, 100)'),
    ('Trend', 'rgb(100, 0, 10)'),
]
data = [
    go.Box(
        y=df.loc[df['Department Name'] == dept]['polarity'],
        name=dept,
        marker=dict(color=color),
    )
    for dept, color in _dept_colors
]
layout = go.Layout(
    title="Sentiment Polarity Boxplot of Department Name"
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename="Sentiment Polarity Boxplot of Department Name")

department_polarity.py

# Rating box plot per department: one Box trace per department, each with
# its own fixed marker color.
_dept_colors = [
    ('Tops', 'rgb(214, 12, 140)'),
    ('Dresses', 'rgb(0, 128, 128)'),
    ('Bottoms', 'rgb(10, 140, 208)'),
    ('Intimate', 'rgb(12, 102, 14)'),
    ('Jackets', 'rgb(10, 0, 100)'),
    ('Trend', 'rgb(100, 0, 10)'),
]
data = [
    go.Box(
        y=df.loc[df['Department Name'] == dept]['Rating'],
        name=dept,
        marker=dict(color=color),
    )
    for dept, color in _dept_colors
]
layout = go.Layout(
    title="Rating Boxplot of Department Name"
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename="Rating Boxplot of Department Name")

rating_division.py

# Review-length box plot per department: one Box trace per department,
# each with its own fixed marker color.
_dept_colors = [
    ('Tops', 'rgb(214, 12, 140)'),
    ('Dresses', 'rgb(0, 128, 128)'),
    ('Bottoms', 'rgb(10, 140, 208)'),
    ('Intimate', 'rgb(12, 102, 14)'),
    ('Jackets', 'rgb(10, 0, 100)'),
    ('Trend', 'rgb(100, 0, 10)'),
]
data = [
    go.Box(
        y=df.loc[df['Department Name'] == dept]['review_len'],
        name=dept,
        marker=dict(color=color),
    )
    for dept, color in _dept_colors
]
layout = go.Layout(
    title="Review length Boxplot of Department Name"
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename="Review Length Boxplot of Department Name")

length_department.py

#### 使用Plotly进行双变量可视化

# Overlay polarity histograms for recommended vs not-recommended reviews.
recommended = df.loc[df['Recommended IND'] == 1, 'polarity']
not_recommended = df.loc[df['Recommended IND'] == 0, 'polarity']

trace1 = go.Histogram(x=not_recommended, name='Not recommended', opacity=0.75)
trace2 = go.Histogram(x=recommended, name='Recommended', opacity=0.75)

layout = go.Layout(
    barmode='overlay',
    title='Distribution of Sentiment polarity of reviews based on Recommendation')
fig = go.Figure(data=[trace1, trace2], layout=layout)

iplot(fig, filename='overlaid histogram')

polarity_recommendation.py

# Overlay Rating histograms for recommended vs not-recommended reviews.
x1 = df.loc[df['Recommended IND'] == 1, 'Rating']
x0 = df.loc[df['Recommended IND'] == 0, 'Rating']

trace1 = go.Histogram(x=x0, name='Not recommended', opacity=0.75)
trace2 = go.Histogram(x=x1, name='Recommended', opacity=0.75)

data = [trace1, trace2]
# Fix: the original title was copy-pasted from the polarity chart; this
# figure plots ratings, so label it accordingly.
layout = go.Layout(
    barmode='overlay',
    title='Distribution of Rating of reviews based on Recommendation')
fig = go.Figure(data=data, layout=layout)

iplot(fig, filename='overlaid histogram')

rating_recommendation.py

# Grouped histograms of review length, split by recommendation flag.
len_recommended = df.loc[df['Recommended IND'] == 1, 'review_len']
len_not_recommended = df.loc[df['Recommended IND'] == 0, 'review_len']

trace1 = go.Histogram(x=len_not_recommended, name='Not recommended', opacity=0.75)
trace2 = go.Histogram(x=len_recommended, name='Recommended', opacity=0.75)

layout = go.Layout(
    barmode='group',
    title='Distribution of Review Lengths Based on Recommendation')
fig = go.Figure(data=[trace1, trace2], layout=layout)

iplot(fig, filename='stacked histogram')

review_length_recommend.py

# Joint view of sentiment polarity vs rating: scatter points plus a 2-D
# density contour in the main axes, with marginal histograms on the
# secondary axes (x2/y2).
scatter_pts = go.Scatter(
    x=df['polarity'], y=df['Rating'], mode='markers', name='points',
    marker=dict(color='rgb(102,0,0)', size=2, opacity=0.4))
density = go.Histogram2dContour(
    x=df['polarity'], y=df['Rating'], name='density', ncontours=20,
    colorscale='Hot', reversescale=True, showscale=False)
marginal_x = go.Histogram(
    x=df['polarity'], name='Sentiment polarity density',
    marker=dict(color='rgb(102,0,0)'), yaxis='y2')
marginal_y = go.Histogram(
    y=df['Rating'], name='Rating density',
    marker=dict(color='rgb(102,0,0)'), xaxis='x2')
data = [scatter_pts, density, marginal_x, marginal_y]

# Main axes take 85% of each dimension; the marginals fill the outer strip.
layout = go.Layout(
    showlegend=False,
    autosize=False,
    width=600,
    height=550,
    xaxis=dict(domain=[0, 0.85], showgrid=False, zeroline=False),
    yaxis=dict(domain=[0, 0.85], showgrid=False, zeroline=False),
    margin=dict(t=50),
    hovermode='closest',
    bargap=0,
    xaxis2=dict(domain=[0.85, 1], showgrid=False, zeroline=False),
    yaxis2=dict(domain=[0.85, 1], showgrid=False, zeroline=False)
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='2dhistogram-2d-density-plot-subplots')

sentiment_polarity_rating.py

# Joint view of reviewer age vs sentiment polarity: scatter points plus a
# 2-D density contour in the main axes, with marginal histograms on the
# secondary axes (x2/y2).
scatter_pts = go.Scatter(
    x=df['Age'], y=df['polarity'], mode='markers', name='points',
    marker=dict(color='rgb(102,0,0)', size=2, opacity=0.4))
density = go.Histogram2dContour(
    x=df['Age'], y=df['polarity'], name='density', ncontours=20,
    colorscale='Hot', reversescale=True, showscale=False)
marginal_x = go.Histogram(
    x=df['Age'], name='Age density',
    marker=dict(color='rgb(102,0,0)'), yaxis='y2')
marginal_y = go.Histogram(
    y=df['polarity'], name='Sentiment Polarity density',
    marker=dict(color='rgb(102,0,0)'), xaxis='x2')
data = [scatter_pts, density, marginal_x, marginal_y]

# Main axes take 85% of each dimension; the marginals fill the outer strip.
layout = go.Layout(
    showlegend=False,
    autosize=False,
    width=600,
    height=550,
    xaxis=dict(domain=[0, 0.85], showgrid=False, zeroline=False),
    yaxis=dict(domain=[0, 0.85], showgrid=False, zeroline=False),
    margin=dict(t=50),
    hovermode='closest',
    bargap=0,
    xaxis2=dict(domain=[0.85, 1], showgrid=False, zeroline=False),
    yaxis2=dict(domain=[0.85, 1], showgrid=False, zeroline=False)
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='2dhistogram-2d-density-plot-subplots')

age_polarity.py

#### 寻找特征术语及其关联

创建语料库时，将category_col参数设置为“Department Name”，并使用Review Text列中的评论，通过设置text_col参数进行分析。最后，将spaCy模型传递给nlp参数并调用build()来构造语料库。

# Build a scattertext corpus keyed on department, then rank terms by their
# scaled F-scores against the background corpus and per category.
corpus = st.CorpusFromPandas(
    df, category_col='Department Name', text_col='Review Text', nlp=nlp,
).build()
print(list(corpus.get_scaled_f_scores_vs_background().index[:10]))

term_freq_df = corpus.get_term_freq_df()
term_freq_df['Tops Score'] = corpus.get_scaled_f_scores('Tops')
pprint(list(term_freq_df.sort_values(by='Tops Score', ascending=False).index[:10]))

term_freq_df['Dresses Score'] = corpus.get_scaled_f_scores('Dresses')
pprint(list(term_freq_df.sort_values(by='Dresses Score', ascending=False).index[:10]))

#### 图29主题建模评论文本

LSA模型用TF-IDF分数替换文档-术语矩阵中的原始计数。

# LSA topic modeling: build a TF-IDF document-term matrix and reduce it
# with truncated SVD, then label each review with its strongest topic.
reindexed_data = df['Review Text']
tfidf_vectorizer = TfidfVectorizer(stop_words='english', use_idf=True, smooth_idf=True)
reindexed_data = reindexed_data.values
document_term_matrix = tfidf_vectorizer.fit_transform(reindexed_data)
n_topics = 6
lsa_model = TruncatedSVD(n_components=n_topics)
lsa_topic_matrix = lsa_model.fit_transform(document_term_matrix)

def get_keys(topic_matrix):
    """Return an integer list of predicted topic categories, one per
    document row of *topic_matrix* (the argmax topic)."""
    keys = topic_matrix.argmax(axis=1).tolist()
    return keys

def keys_to_counts(keys):
    """Return a (categories, counts) tuple of topic ids and their
    accompanying magnitudes for a given list of keys."""
    count_pairs = Counter(keys).items()
    categories = [pair[0] for pair in count_pairs]
    counts = [pair[1] for pair in count_pairs]
    return (categories, counts)

lsa_keys = get_keys(lsa_topic_matrix)
lsa_categories, lsa_counts = keys_to_counts(lsa_keys)

def get_top_n_words(n, keys, document_term_matrix, tfidf_vectorizer):
    """Return a list of n_topics strings, where each string contains the
    n most common words of a predicted topic, in order."""
    top_word_indices = []
    for topic in range(n_topics):
        # Sum the TF-IDF rows of every document assigned to this topic.
        temp_vector_sum = 0
        for i in range(len(keys)):
            if keys[i] == topic:
                temp_vector_sum += document_term_matrix[i]
        temp_vector_sum = temp_vector_sum.toarray()
        top_n_word_indices = np.flip(np.argsort(temp_vector_sum)[0][-n:], 0)
        top_word_indices.append(top_n_word_indices)
    top_words = []
    for topic in top_word_indices:
        topic_words = []
        for index in topic:
            # One-hot vector -> inverse_transform recovers the word itself.
            temp_word_vector = np.zeros((1, document_term_matrix.shape[1]))
            temp_word_vector[:, index] = 1
            the_word = tfidf_vectorizer.inverse_transform(temp_word_vector)[0][0]
            topic_words.append(the_word.encode('ascii').decode('utf-8'))
        top_words.append(" ".join(topic_words))
    # Fix: the garbled listing dropped this return statement and mis-indented
    # the call below into the function body (which would recurse forever).
    return top_words

top_n_words_lsa = get_top_n_words(3, lsa_keys, document_term_matrix, tfidf_vectorizer)

for i in range(len(top_n_words_lsa)):
    print("Topic {}: ".format(i + 1), top_n_words_lsa[i])

topic_model_LSA.py

# Bar chart of how many reviews fall into each LSA topic, labeled with the
# topic's top-3 words. Fix: the listing lost its first line, leaving
# top_3_words undefined; recompute it from the fitted LSA model here.
top_3_words = get_top_n_words(3, lsa_keys, document_term_matrix, tfidf_vectorizer)
labels = ['Topic {}: \n'.format(i) + top_3_words[i] for i in lsa_categories]

fig, ax = plt.subplots(figsize=(16, 8))
ax.bar(lsa_categories, lsa_counts)
ax.set_xticks(lsa_categories)
ax.set_xticklabels(labels)
ax.set_ylabel('Number of review text')
ax.set_title('LSA topic counts')
plt.show()

完整代码可以在GitHub（https://github.com/susanli2016/NLPwithPython/blob/master/EDA%20and%20visualization%20for%20Text%20Data.ipynb）上查看。

A Complete Exploratory Data Analysis and Visualization for Text Data: Combine Visualization and NLP to Generate Insights

https://www.kdnuggets.com/2019/05/complete-exploratory-data-analysis-visualization-text-data.html

THU数据派

THU数据派"基于清华，放眼世界"，以扎实的理工功底闯荡“数据江湖”。发布全球大数据资讯，定期组织线下活动，分享前沿产业动态。了解清华大数据，敬请关注姐妹号“数据派THU”。