Auto Byte

Science AI

作者：Susan Li；校对：和中华；翻译：吴金笛

# 文本数据探索性数据分析结合可视化和NLP产生见解（附代码）

Photo credit: Pixabay

文本文档内容的可视化表示是文本挖掘领域中最重要的任务之一。作为一名数据科学家或NLP专家，我们不仅要从不同方面和不同细节层面来探索文档的内容，还要总结单个文档，显示单词和主题，检测事件，以及创建故事情节。

#### 数据

# Load the raw reviews from the CSV file into a DataFrame.
df = pd.read_csv('Womens Clothing E-Commerce Reviews.csv')

# Remove the 'Unnamed: 0' and 'Title' columns in a single call.
df = df.drop(columns=['Unnamed: 0', 'Title'])

# Keep only the rows that actually contain review text.
df = df[df['Review Text'].notnull()]

def preprocess(ReviewText):
    """Strip leftover HTML markup and entities from review texts.

    The substitutions are applied in order to every element:
    ``<br/>`` tags, ``<a ...>...</a>`` links and the entities ``&amp``,
    ``&gt`` and ``&lt`` (with or without the trailing ``;``) are removed,
    and non-breaking spaces are turned into ordinary spaces.

    Args:
        ReviewText: pandas Series of strings (anything exposing the
            ``.str`` accessor).

    Returns:
        A new Series with the cleaned text; the input is not modified.
    """
    # NOTE(review): the HTML-bearing patterns were lost when this snippet was
    # rendered as a web page; they are restored here from what remained.
    substitutions = (
        (r"<br/>", ""),
        # Non-greedy, so two separate links do not swallow the text between them.
        (r"<a\b[^>]*>.*?</a>", ""),
        (r"&amp;?", ""),
        (r"&gt;?", ""),
        (r"&lt;?", ""),
        ("\xa0", " "),
    )
    for pattern, replacement in substitutions:
        # regex=True is explicit because pandas >= 2.0 defaults to literal matching.
        ReviewText = ReviewText.str.replace(pattern, replacement, regex=True)
    return ReviewText

# Clean the review text before deriving any features from it.
df['Review Text'] = preprocess(df['Review Text'])

reviews = df['Review Text']

# Sentiment polarity of each review, as reported by TextBlob.
df['polarity'] = [TextBlob(text).sentiment.polarity for text in reviews]

# Number of characters in each review.
df['review_len'] = reviews.astype(str).map(len)

# Number of whitespace-separated tokens in each review.
df['word_count'] = [len(str(text).split()) for text in reviews]

text_preprocessing.py

# Show five randomly chosen reviews whose polarity is exactly 1.
print('5 random reviews with the highest positive sentiment polarity: \n')
positive_reviews = df.loc[df.polarity == 1, 'Review Text'].sample(5)
for review in positive_reviews:
    print(review)

# Show five randomly chosen reviews whose polarity is exactly 0.
print('5 random reviews with the most neutral sentiment(zero) polarity: \n')
neutral_reviews = df.loc[df.polarity == 0, 'Review Text'].sample(5)
for review in neutral_reviews:
    print(review)

# Show the two reviews with the lowest polarity scores.
print('2 reviews with the most negative polarity: \n')
# Select by rank rather than by comparing against a hard-coded float: an
# exact-equality test only works for one specific dataset and scoring run,
# and sampling 2 rows raises if fewer than two rows match.
most_negative = df.nsmallest(2, 'polarity')
for review in most_negative['Review Text']:
    print(review)