1、代码

def clean_text(text, remove_stopwords=False):
    """
    Clean a raw text string and split it into lowercase word tokens.

    The text is parsed with BeautifulSoup's ``html.parser`` to drop markup,
    every character that is not an ASCII letter is replaced by a space, and
    the result is lower-cased and split on whitespace.

    Args:
        text: Raw input text, possibly containing HTML.
        remove_stopwords: When true, tokens found in ``eng_stopwords`` (a
            collection defined outside this function) are filtered out.

    Returns:
        A list of lowercase word tokens.
    """
    plain = BeautifulSoup(text, 'html.parser').get_text()
    letters_only = re.sub(r'[^a-zA-Z]', ' ', plain)
    tokens = letters_only.lower().split()
    if not remove_stopwords:
        return tokens
    return [token for token in tokens if token not in eng_stopwords]

def to_review_vector(review):
    """
    Convert a review into a single averaged word-embedding vector.

    The review is cleaned and tokenized with ``clean_text`` (stopwords
    removed); every token present in ``model`` is looked up and the
    resulting vectors are averaged element-wise.

    Args:
        review: Raw review text.

    Returns:
        A ``pd.Series`` of length 300 holding the mean of the embeddings of
        all tokens found in ``model``. If no token is found, a series of
        zeros is returned.

    Notes:
        ``model`` is defined outside this function; it is used as a mapping
        that supports ``in`` and item lookup. Its vectors are assumed to be
        300-dimensional -- TODO confirm against the loaded embeddings.
    """
    tokens = clean_text(review, remove_stopwords=True)

    # Collect the embeddings of known tokens only; unknown tokens are skipped.
    vectors = [model[token] for token in tokens if token in model]

    if not vectors:
        # Nothing to average: fall back to the zero vector rather than
        # producing NaNs from the mean of an empty array.
        return pd.Series(np.zeros(300))

    # Average across tokens (axis 0 indexes the tokens here). The previous
    # implementation summed into a (1, 300) array and took the mean over that
    # length-1 axis, which returned the un-normalized sum. float64 keeps the
    # same output dtype as the former accumulator.
    return pd.Series(np.asarray(vectors, dtype=np.float64).mean(axis=0))

 

相关文章:

  • 2021-08-04
  • 2021-04-13
  • 2021-07-08
  • 2021-06-28
  • 2021-04-01
  • 2021-06-12
  • 2021-09-15
猜你喜欢
  • 2022-01-20
  • 2021-11-14
  • 2021-11-06
  • 2021-07-05
  • 2021-07-24
  • 2021-05-20
相关资源
相似解决方案