PyTorch: Complete Step-by-Step Guide
Table of Contents
- Introduction & Setup
- Tensors - The Foundation
- Autograd - Automatic Differentiation
- Neural Networks with nn.Module
- Training Loop
- CNN Example
- RNN/LSTM Example
- Transfer Learning
- Interview Questions
1. Introduction & Setup {#introduction}
What is PyTorch?
PyTorch is an open-source machine learning library developed by Meta's AI Research lab (formerly Facebook AI Research). It provides:
- Dynamic computational graphs
- Strong GPU acceleration
- Rich ecosystem for deep learning
Installation
# CPU only
pip install torch torchvision
# With CUDA (GPU)
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
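To verify the install, a quick check of the version and GPU visibility:
import torch
print(torch.__version__)          # installed PyTorch version
print(torch.cuda.is_available())  # True only if a CUDA build can see a GPU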
Basic Import
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import torchvision
import torchvision.transforms as transforms
2. Tensors - The Foundation {#tensors}
Creating Tensors
# From data
tensor_from_list = torch.tensor([1, 2, 3, 4])
import numpy as np  # needed for the NumPy interop below
tensor_from_numpy = torch.from_numpy(np.array([1, 2, 3]))
# Random tensors
random_tensor = torch.rand(3, 4) # Uniform distribution [0, 1)
normal_tensor = torch.randn(3, 4) # Normal distribution
zeros = torch.zeros(3, 4)
ones = torch.ones(3, 4)
# With specific dtype and device
tensor_float = torch.tensor([1, 2, 3], dtype=torch.float32)
tensor_gpu = torch.tensor([1, 2, 3], device='cuda')
Tensor Operations
# Basic operations
a = torch.tensor([[1, 2], [3, 4]], dtype=torch.float32)
b = torch.tensor([[5, 6], [7, 8]], dtype=torch.float32)
# Arithmetic
c = a + b # Element-wise addition
d = a * b # Element-wise multiplication
e = a @ b # Matrix multiplication (or torch.matmul)
# Reshaping
x = torch.randn(4, 6)
y = x.view(2, 12) # Reshape to 2x12
z = x.view(-1, 3) # Automatically calculate first dimension
# Indexing and Slicing
tensor = torch.randn(3, 4)
first_row = tensor[0]
first_col = tensor[:, 0]
sub_tensor = tensor[1:, :2]
GPU Operations
# Check if CUDA is available
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
# Move tensors to GPU
tensor_cpu = torch.randn(3, 4)
tensor_gpu = tensor_cpu.to(device)
# Or create directly on GPU
tensor_gpu = torch.randn(3, 4, device=device)
3. Autograd - Automatic Differentiation {#autograd}
Basic Autograd
# Create tensor with gradient tracking
x = torch.randn(3, requires_grad=True)
y = x * 2
z = y * y * 3
out = z.mean()
# Compute gradients
out.backward()
print(x.grad) # dout/dx
# Stop gradient tracking
with torch.no_grad():
    y = x * 2  # No gradient computation
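Because the graph is built on the fly as Python executes, ordinary control flow can sit between differentiable operations. A small sketch:
x = torch.tensor(3.0, requires_grad=True)
y = x
while y < 100:       # plain Python control flow, traced as it runs
    y = y * x
y.backward()         # y ended up as x**5, so dy/dx = 5 * x**4
print(x.grad)        # tensor(405.)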
Gradient Examples
# Example: Simple linear function
x = torch.tensor([1., 2., 3.], requires_grad=True)
y = 2 * x + 3
y.backward(torch.tensor([1., 1., 1.])) # dy/dx = 2
print(x.grad) # tensor([2., 2., 2.])
# Reset gradients
x.grad.zero_()
# Example: More complex function
x = torch.tensor(2., requires_grad=True)
y = x ** 2 + 2 * x + 1
y.backward()
print(x.grad) # dy/dx = 2x + 2 = 6
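Putting these pieces together, a hand-rolled gradient-descent step shows why gradients must be reset between iterations (the optimizers used later do this via optimizer.zero_grad()):
w = torch.tensor(5.0, requires_grad=True)
for step in range(3):
    loss = (w - 2.0) ** 2        # simple quadratic with its minimum at w = 2
    loss.backward()              # accumulates d(loss)/dw into w.grad
    with torch.no_grad():
        w -= 0.1 * w.grad        # update without tracking the operation
    w.grad.zero_()               # reset, otherwise gradients keep accumulating
print(w)                         # moves toward 2.0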
4. Neural Networks with nn.Module {#neural-networks}
Simple Neural Network
class SimpleNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x
# Create model
model = SimpleNN(input_size=10, hidden_size=20, output_size=3)
print(model)
# Forward pass
input_data = torch.randn(5, 10) # Batch of 5 samples
output = model(input_data)
print(output.shape) # torch.Size([5, 3])
More Complex Network
class DeepNN(nn.Module):
    def __init__(self, num_classes=10):
        super(DeepNN, self).__init__()
        self.features = nn.Sequential(
            nn.Linear(784, 256),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(128, 64),
            nn.ReLU()
        )
        self.classifier = nn.Linear(64, num_classes)

    def forward(self, x):
        x = x.view(x.size(0), -1)  # Flatten
        x = self.features(x)
        x = self.classifier(x)
        return x
5. Training Loop {#training-loop}
Complete Training Example
# Dataset and DataLoader
class CustomDataset(Dataset):
    def __init__(self, data, targets):
        self.data = data
        self.targets = targets

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.targets[idx]
# Create dummy data
X_train = torch.randn(1000, 784)
y_train = torch.randint(0, 10, (1000,))
# Create dataset and dataloader
dataset = CustomDataset(X_train, y_train)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)
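Before training, it is worth pulling a single batch to confirm shapes:
data_batch, target_batch = next(iter(dataloader))
print(data_batch.shape, target_batch.shape)  # torch.Size([32, 784]) torch.Size([32])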
# Model, Loss, Optimizer
model = DeepNN(num_classes=10)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Training loop
def train_epoch(model, dataloader, criterion, optimizer, device):
    model.train()
    total_loss = 0
    correct = 0
    for batch_idx, (data, target) in enumerate(dataloader):
        data, target = data.to(device), target.to(device)
        # Zero gradients
        optimizer.zero_grad()
        # Forward pass
        output = model(data)
        loss = criterion(output, target)
        # Backward pass
        loss.backward()
        # Update weights
        optimizer.step()
        # Statistics
        total_loss += loss.item()
        pred = output.argmax(dim=1)
        correct += pred.eq(target).sum().item()
    avg_loss = total_loss / len(dataloader)
    accuracy = 100. * correct / len(dataloader.dataset)
    return avg_loss, accuracy
# Train for multiple epochs
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
num_epochs = 10
for epoch in range(num_epochs):
    loss, accuracy = train_epoch(model, dataloader, criterion, optimizer, device)
    print(f'Epoch {epoch+1}: Loss = {loss:.4f}, Accuracy = {accuracy:.2f}%')
Validation and Testing
def evaluate(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in dataloader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            total_loss += criterion(output, target).item()
            pred = output.argmax(dim=1)
            correct += pred.eq(target).sum().item()
    avg_loss = total_loss / len(dataloader)
    accuracy = 100. * correct / len(dataloader.dataset)
    return avg_loss, accuracy
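A typical run alternates the two helpers each epoch. A minimal sketch, assuming a held-out validation split wrapped in a val_loader built the same way as dataloader above:
# Hypothetical validation split, built like the training loader
val_loader = DataLoader(CustomDataset(torch.randn(200, 784),
                                      torch.randint(0, 10, (200,))),
                        batch_size=32)

for epoch in range(num_epochs):
    train_loss, train_acc = train_epoch(model, dataloader, criterion, optimizer, device)
    val_loss, val_acc = evaluate(model, val_loader, criterion, device)
    print(f'Epoch {epoch+1}: train acc = {train_acc:.2f}%, val acc = {val_acc:.2f}%')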
6. CNN Example {#cnn-example}
Convolutional Neural Network
class CNN(nn.Module):
    def __init__(self, num_classes=10):
        super(CNN, self).__init__()
        # Convolutional layers
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        # Pooling
        self.pool = nn.MaxPool2d(2, 2)
        # Fully connected layers (32x32 input -> 4x4 feature maps after three 2x2 pools)
        self.fc1 = nn.Linear(128 * 4 * 4, 512)
        self.fc2 = nn.Linear(512, num_classes)
        # Dropout
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        # Conv block 1
        x = self.pool(F.relu(self.conv1(x)))
        # Conv block 2
        x = self.pool(F.relu(self.conv2(x)))
        # Conv block 3
        x = self.pool(F.relu(self.conv3(x)))
        # Flatten
        x = x.view(-1, 128 * 4 * 4)
        # FC layers
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x
# Example with CIFAR-10
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])
trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=transform)
trainloader = DataLoader(trainset, batch_size=64, shuffle=True)
# Initialize and train
cnn_model = CNN(num_classes=10)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(cnn_model.parameters(), lr=0.001, momentum=0.9)
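Training works exactly like Section 5; a sketch that reuses the train_epoch helper with the CIFAR-10 loader:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
cnn_model.to(device)

for epoch in range(5):
    loss, accuracy = train_epoch(cnn_model, trainloader, criterion, optimizer, device)
    print(f'Epoch {epoch+1}: Loss = {loss:.4f}, Accuracy = {accuracy:.2f}%')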
7. RNN/LSTM Example {#rnn-example}
LSTM for Text Classification
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim,
                 n_layers=2, dropout=0.5):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim,
                            num_layers=n_layers,
                            dropout=dropout,
                            batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        # text shape: (batch_size, sequence_length)
        embedded = self.embedding(text)
        # embedded shape: (batch_size, sequence_length, embedding_dim)
        output, (hidden, cell) = self.lstm(embedded)
        # hidden shape: (n_layers, batch_size, hidden_dim)
        # Use last hidden state
        hidden = hidden[-1]  # Get last layer
        # Apply dropout and FC
        output = self.dropout(hidden)
        output = self.fc(output)
        return output
# Example usage
vocab_size = 10000
model = LSTMClassifier(vocab_size,
                       embedding_dim=100,
                       hidden_dim=256,
                       output_dim=2,  # Binary classification
                       n_layers=2)
# Example input (batch of tokenized sentences)
input_text = torch.randint(0, vocab_size, (32, 50)) # 32 sentences, 50 tokens each
output = model(input_text)
print(output.shape) # torch.Size([32, 2])
8. Transfer Learning {#transfer-learning}
Using Pretrained Models
import torchvision.models as models
# Load pretrained ResNet (newer torchvision prefers weights=models.ResNet50_Weights.DEFAULT)
resnet = models.resnet50(pretrained=True)
# Freeze all layers
for param in resnet.parameters():
    param.requires_grad = False
# Replace final layer for new task
num_features = resnet.fc.in_features
resnet.fc = nn.Linear(num_features, 10) # 10 classes
# Only train the final layer
optimizer = optim.Adam(resnet.fc.parameters(), lr=0.001)
# Fine-tuning: Unfreeze some layers
for param in resnet.layer4.parameters():
    param.requires_grad = True
# Now optimize all unfrozen parameters
optimizer = optim.Adam(filter(lambda p: p.requires_grad,
                              resnet.parameters()), lr=0.0001)
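A quick sanity check that the new head is wired correctly is to push a dummy ImageNet-sized batch through the model:
resnet.eval()
dummy = torch.randn(2, 3, 224, 224)   # batch of 2 RGB images at 224x224
with torch.no_grad():
    out = resnet(dummy)
print(out.shape)                      # torch.Size([2, 10]) from the new 10-class head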
9. Interview Questions {#interview-questions}
Basic Questions
Q1: What is the difference between tensor.view() and tensor.reshape()?
# Answer: view() requires contiguous memory; reshape() works either way, copying if it must
x = torch.randn(2, 3)
y = x.t() # Transpose makes it non-contiguous
# z = y.view(6) # This would fail
z = y.reshape(6) # This works
z = y.contiguous().view(6) # This also works
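You can check contiguity directly before deciding which call is safe:
print(x.is_contiguous())  # True
print(y.is_contiguous())  # False after the transpose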
Q2: Explain requires_grad and how to freeze layers
# requires_grad=True enables gradient computation
x = torch.randn(3, requires_grad=True)
# Freeze layers:
for param in model.parameters():
    param.requires_grad = False
Q3: What's the difference between nn.Module and nn.functional?
# nn.Module: stateful, owns its parameters (typically created in __init__)
self.conv = nn.Conv2d(3, 64, 3)  # has learnable weight and bias
# nn.functional: stateless, functional interface
output = F.conv2d(x, weight, bias)  # you manage the weight and bias tensors yourself
Intermediate Questions
Q4: Implement a custom loss function
class FocalLoss(nn.Module):
    def __init__(self, alpha=1, gamma=2):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, inputs, targets):
        ce_loss = F.cross_entropy(inputs, targets, reduction='none')
        pt = torch.exp(-ce_loss)
        focal_loss = self.alpha * (1 - pt) ** self.gamma * ce_loss
        return focal_loss.mean()
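Usage mirrors the built-in losses; a quick check with random logits:
focal = FocalLoss(alpha=1, gamma=2)
logits = torch.randn(8, 10)              # batch of 8 samples, 10 classes
targets = torch.randint(0, 10, (8,))
print(focal(logits, targets).item())     # scalar loss value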
Q5: Implement gradient clipping
# Clip gradients to prevent exploding gradients
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
# Or clip by value
torch.nn.utils.clip_grad_value_(model.parameters(), clip_value=1.0)
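Clipping belongs between loss.backward() and optimizer.step(), so the optimizer applies the clipped gradients. A sketch of the placement, assuming the model, loader, loss, and optimizer from Section 5:
for data, target in dataloader:
    optimizer.zero_grad()
    loss = criterion(model(data), target)
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # clip after backward
    optimizer.step()                                                  # then update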
Q6: Explain and implement batch normalization
class ModelWithBN(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(100, 50)
        self.bn1 = nn.BatchNorm1d(50)
        self.fc2 = nn.Linear(50, 10)

    def forward(self, x):
        x = self.fc1(x)
        x = self.bn1(x)  # Normalize before activation
        x = F.relu(x)
        x = self.fc2(x)
        return x
Advanced Questions
Q7: Implement a custom Dataset with data augmentation
from PIL import Image  # needed for loading image files

class CustomImageDataset(Dataset):
    def __init__(self, image_paths, labels, transform=None):
        self.image_paths = image_paths
        self.labels = labels
        self.transform = transform

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        image = Image.open(self.image_paths[idx])
        label = self.labels[idx]
        if self.transform:
            image = self.transform(image)
        return image, label
# With augmentation
transform = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(10),
    transforms.ColorJitter(brightness=0.2, contrast=0.2),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])
Q8: Implement learning rate scheduling
# Different schedulers
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)
scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.95)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',
                                                 patience=10, factor=0.1)
# Custom scheduler
class WarmupScheduler(optim.lr_scheduler._LRScheduler):
    def __init__(self, optimizer, warmup_epochs, last_epoch=-1):
        self.warmup_epochs = warmup_epochs
        super().__init__(optimizer, last_epoch)

    def get_lr(self):
        if self.last_epoch < self.warmup_epochs:
            return [base_lr * (self.last_epoch + 1) / self.warmup_epochs
                    for base_lr in self.base_lrs]
        return self.base_lrs
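Schedulers are stepped alongside the training loop, usually once per epoch; ReduceLROnPlateau is the exception because it steps on the metric it monitors. A sketch, again assuming the objects from Section 5:
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)
for epoch in range(num_epochs):
    train_epoch(model, dataloader, criterion, optimizer, device)
    scheduler.step()                  # epoch-based schedulers take no argument
    # ReduceLROnPlateau would instead be stepped with a monitored value,
    # e.g. plateau_scheduler.step(val_loss)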
Q9: Implement attention mechanism
class Attention(nn.Module):
    def __init__(self, hidden_dim):
        super(Attention, self).__init__()
        self.attn = nn.Linear(hidden_dim * 2, hidden_dim)
        self.v = nn.Linear(hidden_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs):
        # hidden: (batch_size, hidden_dim)
        # encoder_outputs: (batch_size, seq_len, hidden_dim)
        batch_size = encoder_outputs.size(0)
        seq_len = encoder_outputs.size(1)
        # Repeat hidden state
        hidden = hidden.unsqueeze(1).repeat(1, seq_len, 1)
        # Concatenate
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))
        # Compute attention scores
        attention = self.v(energy).squeeze(2)
        attention_weights = F.softmax(attention, dim=1)
        # Apply attention
        context = attention_weights.unsqueeze(2) * encoder_outputs
        context = context.sum(dim=1)
        return context, attention_weights
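A shape check with random inputs makes the contract concrete:
attn = Attention(hidden_dim=256)
hidden = torch.randn(32, 256)                # (batch_size, hidden_dim)
encoder_outputs = torch.randn(32, 50, 256)   # (batch_size, seq_len, hidden_dim)
context, weights = attn(hidden, encoder_outputs)
print(context.shape, weights.shape)          # torch.Size([32, 256]) torch.Size([32, 50])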
Q10: Mixed precision training
from torch.cuda.amp import autocast, GradScaler
model = Model().cuda()  # Model stands in for any nn.Module defined earlier, e.g. DeepNN
optimizer = optim.Adam(model.parameters())
scaler = GradScaler()

for epoch in range(num_epochs):
    for batch in dataloader:
        optimizer.zero_grad()
        # Mixed precision forward pass (assumes the loader yields dict batches)
        with autocast():
            output = model(batch['input'])
            loss = criterion(output, batch['target'])
        # Scale loss and backward
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
Common Debugging Tips
- Shape mismatches: Always print tensor shapes
print(f"Input shape: {x.shape}")
x = self.layer(x)
print(f"After layer shape: {x.shape}")
- Gradient flow issues:
# Check if gradients are flowing
for name, param in model.named_parameters():
    if param.grad is not None:
        print(f"{name}: grad_norm = {param.grad.norm()}")
- Memory issues:
# Clear cache
torch.cuda.empty_cache()
# Check memory usage
print(f"Allocated: {torch.cuda.memory_allocated()/1024**2:.2f} MB")
print(f"Cached: {torch.cuda.memory_reserved()/1024**2:.2f} MB")
This guide covers the essential PyTorch concepts from the basics through advanced topics. Practice implementing these examples and make sure you can explain the underlying concepts in an interview!