Machine Learning
Table of Contents
📦 Import Libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn as skl
import torch
import torch.nn as nn
import torch.optim as optim
- `numpy`, `pandas`: Data manipulation
- `sklearn`: ML tools like vectorizers and train/test split
- `matplotlib`: Visualization (optional)
NLP Preprocessing Setup
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
- `re`: Regex for text cleaning
- `nltk`: Tokenization, stopwords, and stemming
🚫 Custom Stopwords
# Negation words flip the meaning of a review, so they are filtered out of the
# stopword list and therefore survive preprocessing.
# NOTE: the trailing `...` is a literal Ellipsis, presumably a placeholder for
# further negation words; it never compares equal to a string.
negatives = ['no', 'nor', 'not', "don't", ...]
all_stopwords = list(
    filter(lambda word: word not in negatives, stopwords.words('english'))
)
Retains negative words like “not”, which are important for sentiment.
Load Dataset
# Read the tab-separated review file, then pull out the text column and the
# label column as NumPy arrays.
DATA_PATH = '../Data/review_dataset.tsv'
df = pd.read_csv(DATA_PATH, sep="\t")
X, Y = df['Review'].values, df['Liked'].values
- `X` = text reviews
- `Y` = labels (0 = negative, 1 = positive)
🧹 Text Preprocessing Function
def preprocessing(data):
    """Clean, filter, and stem a collection of sentences.

    Each sentence is lowercased, stripped of every character that is not an
    ASCII letter or a space, split on whitespace, filtered against the
    module-level ``all_stopwords`` list, and reduced to Porter stems.

    Args:
        data: Iterable of strings, one sentence per item.

    Returns:
        A list with one space-joined string of stemmed tokens per input
        sentence, in the same order as ``data``.
    """
    # Loop-invariant setup: compile the pattern, build the stemmer, and turn
    # the stopword list into a set once, instead of once per sentence/token.
    non_letters = re.compile(r'[^a-zA-Z ]')
    stop_set = set(all_stopwords)
    stemmer = PorterStemmer()

    corpus = []
    for sen in data:
        # Characters are deleted (not replaced by a space), so "don't"
        # becomes the single token "dont".
        clean = non_letters.sub('', sen.lower())
        stemmed = [stemmer.stem(w) for w in clean.split() if w not in stop_set]
        corpus.append(" ".join(stemmed))
    return corpus
- Lowercase
- Remove punctuation (and any other non-letter characters)
- Remove stopwords (except negatives)
- Stem each word
Vectorization (Bag of Words)
from sklearn.feature_extraction.text import CountVectorizer

# `corpus` only exists as a local variable inside preprocessing(), so the
# cleaned reviews have to be produced here before they can be vectorized.
corpus = preprocessing(X)

# Fit the vocabulary on the cleaned reviews and convert the sparse count
# matrix to a dense array.
cv = CountVectorizer()
X_cv_data = cv.fit_transform(corpus).toarray()
- Converts text to numeric format
- Each word is a feature (column)
Convert to PyTorch Tensors
# The dense bag-of-words counts become a float32 tensor (model input).
X = torch.tensor(X_cv_data, dtype=torch.float32)
# The labels become an int64 (torch.long) tensor; they are later passed to
# the loss function as targets.
Y = torch.tensor(Y, dtype=torch.long)
🎲 Train-Test Split
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for evaluation. No random_state is passed, so
# the split (and therefore the reported accuracy) can differ between runs.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)
🧠 Define the Model
class SentimentAnalysis(nn.Module):
    """Three-layer feed-forward classifier over fixed-width feature vectors.

    Args:
        input_dim: Number of input features per sample.
        hidden1: Width of the first hidden layer (default 60).
        hidden2: Width of the second hidden layer (default 10).
        num_classes: Number of output scores per sample (default 2).
    """

    def __init__(self, input_dim, hidden1=60, hidden2=10, num_classes=2):
        super().__init__()
        # Attribute names l1/l2/l3 and their creation order are kept so that
        # parameter names and seeded initialisation match the original.
        self.l1 = nn.Linear(input_dim, hidden1)
        self.l2 = nn.Linear(hidden1, hidden2)
        self.l3 = nn.Linear(hidden2, num_classes)

    def forward(self, x):
        """Return the raw output of the last linear layer for batch ``x``."""
        x = torch.relu(self.l1(x))
        x = torch.relu(self.l2(x))
        # No activation or softmax on the output layer.
        return self.l3(x)
Model Setup
# The input width is the number of feature columns in the training matrix.
model = SentimentAnalysis(X_train.shape[1])
# Cross-entropy loss; the training loop calls it with the model outputs and
# the label tensor.
criterion = nn.CrossEntropyLoss()
# Adam optimizer over all model parameters with a learning rate of 0.01.
optimizer = optim.Adam(model.parameters(), lr=0.01)
Training the Model
# Mini-batch training: fixed-size consecutive slices of the training set,
# one optimizer step per slice, summed batch loss reported once per epoch.
NUM_EPOCHS = 50
BATCH_SIZE = 20

for epoch in range(NUM_EPOCHS):
    model.train()
    total_loss = 0
    for start in range(0, len(X_train), BATCH_SIZE):
        stop = start + BATCH_SIZE
        features, targets = X_train[start:stop], Y_train[start:stop]

        # Clear gradients left over from the previous step before backprop.
        optimizer.zero_grad()
        loss = criterion(model(features), targets)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")
Model Evaluation
# Score the held-out split in eval mode with gradient tracking disabled.
model.eval()
with torch.no_grad():
    val_output = model(X_test)
    # Index of the largest score along dim 1 is the predicted class.
    predicted = torch.max(val_output.data, 1).indices
    accuracy = torch.eq(predicted, Y_test).float().mean()
print(f'Validation Accuracy: {accuracy:.4f}')
Predict New Sentences
def call_model(sen):
    """Classify one raw sentence with the trained model and print the result.

    The sentence is cleaned with ``preprocessing``, vectorized with the
    already-fitted ``cv``, and passed through the module-level ``model``.

    Args:
        sen: Raw review text.

    Returns:
        The predicted label, ``"positive"`` or ``"negative"``.
    """
    print("Sentence:", sen)
    clean = preprocessing([sen])
    sent = cv.transform(clean).toarray()
    tensor_input = torch.tensor(sent, dtype=torch.float32)

    # Inference only: use eval mode and skip building the autograd graph.
    model.eval()
    with torch.no_grad():
        pred = model(tensor_input)

    # .item() yields a plain int, so the comparison below is an ordinary bool
    # instead of a one-element tensor.
    predicted_class = torch.argmax(pred, dim=1).item()

    label = "positive" if predicted_class == 1 else "negative"
    print("Predicted Class:", label)
    return label
Use this function to classify new reviews with the trained model.
Read other posts