imdb_training_conv1.py
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import EarlyStopping
import pickle
# Hides the GPU from TensorFlow
tf.config.set_visible_devices([], 'GPU')
# Base path for the dataset
dataset_path = 'data/aclImdb'
train_dataset = keras.utils.text_dataset_from_directory(
    os.path.expanduser(os.path.join(dataset_path, 'train')),
    class_names=['neg', 'pos'],
    batch_size=32
)
valid_dataset = keras.utils.text_dataset_from_directory(
    os.path.expanduser(os.path.join(dataset_path, 'test')),
    class_names=['neg', 'pos'],  # Explicit, so label 0 = neg and 1 = pos on both splits
    batch_size=32
)
# 1. Prepare text data from dataset
texts = []
labels = []
for text_batch, label_batch in train_dataset:
    for text, label in zip(text_batch.numpy(), label_batch.numpy()):
        texts.append(text.decode('utf-8'))
        labels.append([label])  # Wrap in a list so y ends up with shape (n_samples, 1)
print(f"Number of training examples: {len(texts)}")
print(f"Example text: {texts[0][:100]}...")
print(f"Example label: {labels[0]}")
# 2. Tokenize and pad
max_words = 10000
max_len = 100
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
X = pad_sequences(sequences, maxlen=max_len)
y = np.array(labels)
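# Shape check (illustrative addition, not in the original script): with
# pad_sequences' default pre-padding, X should be (n_samples, max_len).
print(f"X shape: {X.shape}, y shape: {y.shape}")  # expect (25000, 100) and (25000, 1)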
# Save the fitted tokenizer so the exact same vocabulary can be reused at inference time
with open("tokenizer1.pkl", "wb") as f:
    pickle.dump(tokenizer, f)
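# Round-trip sketch (illustrative addition): reload the pickle to confirm it
# restores the same vocabulary; this is how inference code would load it.
with open("tokenizer1.pkl", "rb") as f:
    tokenizer_reloaded = pickle.load(f)
assert tokenizer_reloaded.word_index == tokenizer.word_index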
# 3. Load GloVe embeddings
embedding_dim = 50
embeddings_index = {}
glove_path = 'glove.6B.50d.txt'
with open(glove_path, encoding='utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
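# Sanity-check sketch (illustrative addition): glove.6B.50d.txt should yield
# 400k words with 50-dimensional vectors; 'movie' is assumed to be in-vocabulary.
print(f"Loaded {len(embeddings_index)} GloVe vectors")
movie_vec = embeddings_index.get('movie')
print(f"'movie' vector dims: {movie_vec.shape[0] if movie_vec is not None else 'not found'}")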
# 4. Prepare embedding matrix
word_index = tokenizer.word_index
num_words = min(max_words, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in word_index.items():
    if i >= max_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
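# Coverage sketch (illustrative addition): count how many of the top max_words
# tokens received a pretrained vector. Row 0 is the padding index and always
# stays all-zero, as do rows for words missing from GloVe.
hits = int(np.count_nonzero(np.any(embedding_matrix != 0, axis=1)))
print(f"GloVe coverage: {hits}/{num_words} rows have pretrained vectors")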
# 5. Build the model: frozen GloVe embeddings feeding a small Conv1D classifier
model = models.Sequential([
    layers.InputLayer(input_shape=(max_len,)),
    layers.Embedding(
        input_dim=num_words,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        trainable=False  # Keep the pretrained GloVe vectors frozen
    ),
    layers.Conv1D(64, 3, activation='relu'),  # Local n-gram (width-3) feature extraction
    layers.GlobalMaxPooling1D(),              # Keep only the strongest response per filter
    layers.Dropout(0.2),                      # Regularization against overfitting
    layers.Dense(32, activation='relu'),
    layers.Dense(16, activation='relu'),
    layers.Dropout(0.2),                      # Additional dropout before the output
    layers.Dense(y.shape[1], activation='sigmoid')  # Single sigmoid unit, since y is (n, 1)
])
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy', 'AUC', 'Precision', 'Recall']  # 'AUC' registers as val_auc, monitored below
)
model.summary()
# 6. Train
early_stop = EarlyStopping(monitor='val_auc', mode='max', patience=2, restore_best_weights=True)
model.fit(X, y, epochs=20, verbose=1, validation_split=0.2, callbacks=[early_stop])
# 6b. Save the model
model.save('imdb_conv1.keras')
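# Held-out evaluation sketch (illustrative addition): valid_dataset, loaded
# from the test split above, is otherwise unused, so score the saved model on
# it once. Reloading also verifies that imdb_conv1.keras round-trips.
reloaded = keras.models.load_model('imdb_conv1.keras')
val_texts, val_labels = [], []
for text_batch, label_batch in valid_dataset:
    val_texts.extend(t.decode('utf-8') for t in text_batch.numpy())
    val_labels.extend(label_batch.numpy())
X_val = pad_sequences(tokenizer.texts_to_sequences(val_texts), maxlen=max_len)
y_val = np.array(val_labels).reshape(-1, 1)
print(reloaded.evaluate(X_val, y_val, verbose=0, return_dict=True))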
# 7. Prediction pipeline
def prediction_pipeline(text, model, tokenizer, max_len):
    """
    Run all preprocessing steps on a raw string and return the predicted sentiment.

    Args:
        text (str): Input text to classify.
        model: Trained Keras model.
        tokenizer: Tokenizer fitted on the training texts.
        max_len (int): Padding length used during training.

    Returns:
        str: Either "positive" or "negative".
    """
    sequence = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(sequence, maxlen=max_len)
    prediction = model.predict(padded, verbose=0)[0][0]
    return "positive" if prediction > 0.5 else "negative"
# Example usage
text = "This movie was fantastic! I loved it."
sentiment = prediction_pipeline(text, model, tokenizer, max_len)
print(f"Sentiment: {sentiment}")