Predicting Movie Review Sentiment with TensorFlow and TensorBoard
Ref: http://www.cnblogs.com/libinggen/p/6939577.html
使用LSTM的原因之一是: 在较深(或时间步较长)的RNN中,梯度在反向传播过程中不断连乘累积,容易趋近于零(梯度消失)或变得无穷大(梯度爆炸),导致网络无法继续优化;LSTM正是为了解决这一问题。
Thanks to Jürgen Schmidhuber
Using the data from an old Kaggle competition “Bag of Words Meets Bags of Popcorn”
import pandas as pd
import numpy as np
import tensorflow as tf
import nltk, re, time
from nltk.corpus import stopwords
from collections import defaultdict
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from collections import namedtuple
Preprocessing
The data is formatted as .tsv
- Remove stopwords
- Convert words to lower case
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the NLTK English stopword set, loading it on first use only."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        # Reading the corpus is comparatively slow; do it once, not per review.
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def clean_text(text, remove_stopwords=True):
    """Clean a raw review string, with the option to remove stopwords.

    Steps, in order:
      1. Lower-case the text and split it on whitespace.
      2. Optionally drop tokens that are NLTK English stopwords.
      3. Replace ``<br />`` tags and every character outside ``a-z``
         with a space.
      4. Collapse runs of whitespace to a single space and trim the ends.

    Note that stopwords are matched before punctuation is stripped, so a
    token with punctuation attached (e.g. ``"the,"``) is not removed.

    Args:
        text: The raw review text.
        remove_stopwords: When True, remove English stopwords (requires the
            NLTK ``stopwords`` corpus to be available).

    Returns:
        The cleaned text as a single space-separated string of lower-case
        words (an empty string if nothing remains).
    """
    # Convert words to lower case and split them
    words = text.lower().split()

    # Optionally, remove stop words
    if remove_stopwords:
        stops = _english_stopwords()
        words = [w for w in words if w not in stops]

    text = " ".join(words)

    # Clean the text: HTML line breaks first, then anything that is not a-z
    text = re.sub(r"<br />", " ", text)
    text = re.sub(r"[^a-z]", " ", text)

    # Remove any extra spaces left behind by the substitutions above
    text = re.sub(r"\s+", " ", text).strip()

    return text