导包
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.text import Text
from nltk import ngrams,FreqDist
读数据
# Load the Amazon search-term ranking data and slice it into four
# audience segments by the prefix of the (misspelled-in-source) 'catelogy' column.
data = pd.read_csv(r'D:\数据\亚马逊搜索词排名\asin.csv', encoding='gb18030')

# One boolean mask per segment; each frame keeps only rows whose
# category string starts with the given prefix.
_cat = data['catelogy']
women = data.loc[_cat.str.startswith('Women')]
men = data.loc[_cat.str.startswith('Men')]
boy = data.loc[_cat.str.startswith('Boy')]
girl = data.loc[_cat.str.startswith('Girl')]
停用词设置
# Build the stop-list: standard English stopwords plus punctuation tokens
# that word_tokenize emits as separate tokens (e.g. "'s", "``", "''").
# Fix: the original list carried duplicates ('&' three times, "''" and '|'
# twice each) — they added nothing to membership tests and are removed.
english_stopwords = stopwords.words("english")
english_punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '!', '@', '#', '%', '$',
                        '*', '…', "'s", "''", '&', '‘', '’', '”', '~', '/', "'", '|', '+', '-', '``']
english_stopwords.extend(english_punctuations)
构建单词处理和统计函数
def process_keywords(keywords, english_stopwords, max_n=4):
    """Tokenize titles, drop stopwords, unify word variants by stem, and
    count n-gram frequencies.

    Parameters
    ----------
    keywords : iterable
        Item titles; each entry is str()-coerced before tokenizing.
    english_stopwords : container of str
        Lowercase tokens (stopwords and punctuation) to discard.
    max_n : int, default 4
        Count n-grams for every size from 1 through max_n (the original
        hard-coded 1..4; the default preserves that behavior).

    Returns
    -------
    dict[int, FreqDist]
        Maps n-gram size to a FreqDist keyed by n-gram tuples.
    """
    # Tokenize every title and lowercase the tokens.
    words = []
    for kw in keywords:
        words.extend(word_tokenize(str(kw)))
    words_lower = [w.lower() for w in words]

    # Drop stopwords/punctuation. A set makes each membership test O(1)
    # (the original tested against a list — O(len(stoplist)) per token).
    stopset = set(english_stopwords)
    words_clear = [w for w in words_lower if w not in stopset]

    # Unify variants of a word: map each stem back to one surface form.
    # Stem each word exactly once (the original stemmed every word twice),
    # and keep the original "last occurrence wins" dict-update semantics.
    st = PorterStemmer()
    stems = [st.stem(w) for w in words_clear]
    stem_to_word = dict(zip(stems, words_clear))
    words_antistem = [stem_to_word[s] for s in stems]

    # Count n-gram occurrences for each requested size.
    all_counts = {}
    for size in range(1, max_n + 1):
        all_counts[size] = FreqDist(ngrams(words_antistem, size))
    return all_counts
统计各个目录不同数量词组组合出现的次数
# For every category in each segment, count 1- to 4-gram phrase frequencies
# in the item titles, then aggregate into one ranked table.
# Fix: the original called pd.concat inside the loop (quadratic copying) and
# iterated every FreqDist twice to build the phrase/count columns.
frames = []
for ct in [women, men, boy, girl]:
    for cat in ct.catelogy.unique():
        titles = ct.loc[ct['catelogy'] == cat, 'item_title']
        all_counts = process_keywords(titles, english_stopwords)
        # Flatten {size: FreqDist} into (n-gram, count) rows in one pass.
        rows = [(gram, freq)
                for dist in all_counts.values()
                for gram, freq in dist.items()]
        count_df = pd.DataFrame(rows, columns=['词组', '次数'])
        count_df['目录'] = cat
        frames.append(count_df)
# Single concat after the loop; guard the degenerate no-data case.
if frames:
    counts = pd.concat(frames, sort=False)
else:
    counts = pd.DataFrame(columns=['词组', '次数', '目录'])
# Phrase length = number of words in the n-gram tuple.
counts['词组数'] = counts['词组'].map(len)
# Sort the words inside each n-gram so word order doesn't split otherwise
# identical phrases, then stringify so groupby can hash the column.
counts['词组'] = counts['词组'].map(sorted).astype(str)
# Total occurrences per (category, phrase, length), ranked by count within
# each category/length group.
df = (counts.groupby(['目录', '词组', '词组数'], as_index=False)['次数'].sum()
      .sort_values(by=['目录', '词组数', '次数'], ascending=[True, True, False]))
结果如下
补充
FreqDist继承自collections.Counter(Counter本身是dict的子类),可以像操作字典一样操作FreqDist对象。FreqDist中的键为单词,值为单词的出现总次数。
ngrams可以实现抽取连续2grams,3grams,4grams等。返回的是迭代器