PyTorch: Complete Step-by-Step Guide

Table of Contents

  1. Introduction & Setup
  2. Tensors - The Foundation
  3. Autograd - Automatic Differentiation
  4. Neural Networks with nn.Module
  5. Training Loop
  6. CNN Example
  7. RNN/LSTM Example
  8. Transfer Learning
  9. Interview Questions

1. Introduction & Setup {#introduction}

What is PyTorch?

PyTorch is an open-source machine learning library developed by Meta AI (formerly Facebook's AI Research lab, FAIR). It provides:

  • Dynamic computational graphs (see the short sketch after this list)
  • Strong GPU acceleration
  • Rich ecosystem for deep learning
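
A minimal sketch of what "dynamic" means in practice: the graph is built as ordinary Python code runs, so control flow such as a data-dependent loop is recorded exactly as it executes, and autograd still tracks it (the numbers below are arbitrary illustration):

import torch

x = torch.randn(3, requires_grad=True)
y = x
n_steps = int(torch.randint(1, 4, (1,)).item())  # loop length decided at run time
for _ in range(n_steps):
    y = y * 2
loss = y.sum()
loss.backward()
print(x.grad)  # each entry equals 2 ** n_steps, whatever n_steps turned out to be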

Installation

# CPU only
pip install torch torchvision

# With CUDA (GPU)
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

Basic Import

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import torchvision
import torchvision.transforms as transforms
import numpy as np  # used for torch.from_numpy in the tensor examples below

2. Tensors - The Foundation {#tensors}

Creating Tensors

# From data
tensor_from_list = torch.tensor([1, 2, 3, 4])
tensor_from_numpy = torch.from_numpy(np.array([1, 2, 3]))

# Random tensors
random_tensor = torch.rand(3, 4)  # Uniform distribution [0, 1)
normal_tensor = torch.randn(3, 4)  # Normal distribution
zeros = torch.zeros(3, 4)
ones = torch.ones(3, 4)

# With specific dtype and device
tensor_float = torch.tensor([1, 2, 3], dtype=torch.float32)
tensor_gpu = torch.tensor([1, 2, 3], device='cuda')

Tensor Operations

# Basic operations
a = torch.tensor([[1, 2], [3, 4]], dtype=torch.float32)
b = torch.tensor([[5, 6], [7, 8]], dtype=torch.float32)

# Arithmetic
c = a + b  # Element-wise addition
d = a * b  # Element-wise multiplication
e = a @ b  # Matrix multiplication (or torch.matmul)

# Reshaping
x = torch.randn(4, 6)
y = x.view(2, 12)  # Reshape to 2x12
z = x.view(-1, 3)  # Automatically calculate first dimension

# Indexing and Slicing
tensor = torch.randn(3, 4)
first_row = tensor[0]
first_col = tensor[:, 0]
sub_tensor = tensor[1:, :2]

GPU Operations

# Check if CUDA is available
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Move tensors to GPU
tensor_cpu = torch.randn(3, 4)
tensor_gpu = tensor_cpu.to(device)

# Or create directly on GPU
tensor_gpu = torch.randn(3, 4, device=device)

3. Autograd - Automatic Differentiation {#autograd}

Basic Autograd

# Create tensor with gradient tracking
x = torch.randn(3, requires_grad=True)
y = x * 2
z = y * y * 3
out = z.mean()

# Compute gradients
out.backward()
print(x.grad)  # dout/dx

# Stop gradient tracking
with torch.no_grad():
    y = x * 2  # No gradient computation

Gradient Examples

# Example: Simple linear function
x = torch.tensor([1., 2., 3.], requires_grad=True)
y = 2 * x + 3
y.backward(torch.tensor([1., 1., 1.]))  # dy/dx = 2
print(x.grad)  # tensor([2., 2., 2.])

# Reset gradients
x.grad.zero_()

# Example: More complex function
x = torch.tensor(2., requires_grad=True)
y = x ** 2 + 2 * x + 1
y.backward()
print(x.grad)  # dy/dx = 2x + 2 = 6
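
One detail worth internalizing before the training loops later on: backward() accumulates into .grad rather than overwriting it, which is exactly why the reset above (and optimizer.zero_grad() later) is needed. A quick sketch:

x = torch.tensor(2., requires_grad=True)
(x ** 2).backward()   # d(x^2)/dx = 2x = 4
(x ** 2).backward()   # a second backward() adds to .grad: 4 + 4
print(x.grad)         # tensor(8.)
x.grad.zero_()        # reset before the next, unrelated backward pass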

4. Neural Networks with nn.Module {#neural-networks}

Simple Neural Network

class SimpleNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)
        
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

# Create model
model = SimpleNN(input_size=10, hidden_size=20, output_size=3)
print(model)

# Forward pass
input_data = torch.randn(5, 10)  # Batch of 5 samples
output = model(input_data)
print(output.shape)  # torch.Size([5, 3])

More Complex Network

class DeepNN(nn.Module):
    def __init__(self, num_classes=10):
        super(DeepNN, self).__init__()
        self.features = nn.Sequential(
            nn.Linear(784, 256),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(128, 64),
            nn.ReLU()
        )
        self.classifier = nn.Linear(64, num_classes)
        
    def forward(self, x):
        x = x.view(x.size(0), -1)  # Flatten
        x = self.features(x)
        x = self.classifier(x)
        return x
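
Note that both networks return raw scores (logits), not probabilities. This is deliberate: nn.CrossEntropyLoss in the next section applies log-softmax internally, so you pass logits to the loss and only apply softmax yourself when you actually want probabilities (e.g. at inference). A quick check with a dummy batch:

net = DeepNN(num_classes=10)
logits = net(torch.randn(4, 784))                   # raw scores, shape (4, 10)
probs = F.softmax(logits, dim=1)                    # only when probabilities are needed
loss = nn.CrossEntropyLoss()(logits, torch.randint(0, 10, (4,)))  # pass logits, not probs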

5. Training Loop {#training-loop}

Complete Training Example

# Dataset and DataLoader
class CustomDataset(Dataset):
    def __init__(self, data, targets):
        self.data = data
        self.targets = targets
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return self.data[idx], self.targets[idx]

# Create dummy data
X_train = torch.randn(1000, 784)
y_train = torch.randint(0, 10, (1000,))

# Create dataset and dataloader
dataset = CustomDataset(X_train, y_train)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Model, Loss, Optimizer
model = DeepNN(num_classes=10)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
def train_epoch(model, dataloader, criterion, optimizer, device):
    model.train()
    total_loss = 0
    correct = 0
    
    for batch_idx, (data, target) in enumerate(dataloader):
        data, target = data.to(device), target.to(device)
        
        # Zero gradients
        optimizer.zero_grad()
        
        # Forward pass
        output = model(data)
        loss = criterion(output, target)
        
        # Backward pass
        loss.backward()
        
        # Update weights
        optimizer.step()
        
        # Statistics
        total_loss += loss.item()
        pred = output.argmax(dim=1)
        correct += pred.eq(target).sum().item()
    
    avg_loss = total_loss / len(dataloader)
    accuracy = 100. * correct / len(dataloader.dataset)
    return avg_loss, accuracy

# Train for multiple epochs
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

num_epochs = 10
for epoch in range(num_epochs):
    loss, accuracy = train_epoch(model, dataloader, criterion, optimizer, device)
    print(f'Epoch {epoch+1}: Loss = {loss:.4f}, Accuracy = {accuracy:.2f}%')

Validation and Testing

def evaluate(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    correct = 0
    
    with torch.no_grad():
        for data, target in dataloader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            total_loss += criterion(output, target).item()
            pred = output.argmax(dim=1)
            correct += pred.eq(target).sum().item()
    
    avg_loss = total_loss / len(dataloader)
    accuracy = 100. * correct / len(dataloader.dataset)
    return avg_loss, accuracy
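
Typical usage pairs evaluate with train_epoch each epoch. The sketch below builds a small dummy validation split the same way as the training data above, purely for illustration:

val_dataset = CustomDataset(torch.randn(200, 784), torch.randint(0, 10, (200,)))
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

for epoch in range(num_epochs):
    train_loss, train_acc = train_epoch(model, dataloader, criterion, optimizer, device)
    val_loss, val_acc = evaluate(model, val_loader, criterion, device)
    print(f'Epoch {epoch+1}: train {train_loss:.4f} / {train_acc:.2f}%, '
          f'val {val_loss:.4f} / {val_acc:.2f}%')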

6. CNN Example {#cnn-example}

Convolutional Neural Network

class CNN(nn.Module):
    def __init__(self, num_classes=10):
        super(CNN, self).__init__()
        # Convolutional layers
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        
        # Pooling
        self.pool = nn.MaxPool2d(2, 2)
        
        # Fully connected layers
        self.fc1 = nn.Linear(128 * 4 * 4, 512)  # assumes 32x32 inputs (e.g. CIFAR-10): 32 -> 16 -> 8 -> 4 after three pools
        self.fc2 = nn.Linear(512, num_classes)
        
        # Dropout
        self.dropout = nn.Dropout(0.5)
        
    def forward(self, x):
        # Conv block 1
        x = self.pool(F.relu(self.conv1(x)))
        
        # Conv block 2
        x = self.pool(F.relu(self.conv2(x)))
        
        # Conv block 3
        x = self.pool(F.relu(self.conv3(x)))
        
        # Flatten
        x = x.view(-1, 128 * 4 * 4)
        
        # FC layers
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        
        return x

# Example with CIFAR-10
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=transform)
trainloader = DataLoader(trainset, batch_size=64, shuffle=True)

# Initialize and train
cnn_model = CNN(num_classes=10)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(cnn_model.parameters(), lr=0.001, momentum=0.9)
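
With these pieces in place, training simply reuses the train_epoch function from the Training Loop section. The two epochs below are just a sketch to show the wiring; real CIFAR-10 training needs many more:

cnn_model.to(device)
for epoch in range(2):  # short demo run
    loss, acc = train_epoch(cnn_model, trainloader, criterion, optimizer, device)
    print(f'Epoch {epoch+1}: Loss = {loss:.4f}, Accuracy = {acc:.2f}%')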

7. RNN/LSTM Example {#rnn-example}

LSTM for Text Classification

class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim,
                 n_layers=2, dropout=0.5):
        super(LSTMClassifier, self).__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, 
                           num_layers=n_layers, 
                           dropout=dropout,
                           batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
        # text shape: (batch_size, sequence_length)
        embedded = self.embedding(text)
        # embedded shape: (batch_size, sequence_length, embedding_dim)
        
        output, (hidden, cell) = self.lstm(embedded)
        # hidden shape: (n_layers, batch_size, hidden_dim)
        
        # Use last hidden state
        hidden = hidden[-1]  # Get last layer
        
        # Apply dropout and FC
        output = self.dropout(hidden)
        output = self.fc(output)
        
        return output

# Example usage
vocab_size = 10000
model = LSTMClassifier(vocab_size, 
                      embedding_dim=100,
                      hidden_dim=256,
                      output_dim=2,  # Binary classification
                      n_layers=2)

# Example input (batch of tokenized sentences)
input_text = torch.randint(0, vocab_size, (32, 50))  # 32 sentences, 50 tokens each
output = model(input_text)
print(output.shape)  # torch.Size([32, 2])

8. Transfer Learning {#transfer-learning}

Using Pretrained Models

import torchvision.models as models

# Load pretrained ResNet
# (newer torchvision versions use: models.resnet50(weights=models.ResNet50_Weights.DEFAULT))
resnet = models.resnet50(pretrained=True)

# Freeze all layers
for param in resnet.parameters():
    param.requires_grad = False

# Replace final layer for new task
num_features = resnet.fc.in_features
resnet.fc = nn.Linear(num_features, 10)  # 10 classes

# Only train the final layer
optimizer = optim.Adam(resnet.fc.parameters(), lr=0.001)

# Fine-tuning: Unfreeze some layers
for param in resnet.layer4.parameters():
    param.requires_grad = True

# Now optimize all unfrozen parameters
optimizer = optim.Adam(filter(lambda p: p.requires_grad, 
                             resnet.parameters()), lr=0.0001)
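
A common variant of this fine-tuning setup, shown here only as a sketch, is to give each part its own learning rate via optimizer parameter groups, so the new head learns faster than the unfrozen pretrained block (the values are arbitrary):

optimizer = optim.Adam([
    {'params': resnet.layer4.parameters(), 'lr': 1e-5},  # gently adapt the unfrozen backbone block
    {'params': resnet.fc.parameters(),     'lr': 1e-3},  # train the new head faster
])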

9. Interview Questions {#interview-questions}

Basic Questions

Q1: What is the difference between tensor.view() and tensor.reshape()?

# Answer: view() requires contiguous memory, reshape() doesn't
x = torch.randn(2, 3)
y = x.t()  # Transpose makes it non-contiguous
# z = y.view(6)  # This would fail
z = y.reshape(6)  # This works
z = y.contiguous().view(6)  # This also works

Q2: Explain requires_grad and how to freeze layers

# requires_grad=True enables gradient computation
x = torch.randn(3, requires_grad=True)

# Freeze layers:
for param in model.parameters():
    param.requires_grad = False

Q3: What's the difference between nn.Module and nn.functional?

# nn.Module: Stateful, has parameters
self.conv = nn.Conv2d(3, 64, 3)  # Has learnable weights

# nn.functional: Stateless, functional interface
output = F.conv2d(input, weight, bias)  # You provide weights
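
A runnable side-by-side for a parameter-free operation (this is why activations are often used in functional form, while layers with weights are kept as modules):

x = torch.randn(2, 8)

relu_module = nn.ReLU()        # module form: construct once (usually in __init__), call like a layer
out1 = relu_module(x)

out2 = F.relu(x)               # functional form: call directly, no state to register

print(torch.equal(out1, out2)) # True - same computation, different packaging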

Intermediate Questions

Q4: Implement a custom loss function

class FocalLoss(nn.Module):
    def __init__(self, alpha=1, gamma=2):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        
    def forward(self, inputs, targets):
        ce_loss = F.cross_entropy(inputs, targets, reduction='none')
        pt = torch.exp(-ce_loss)
        focal_loss = self.alpha * (1-pt)**self.gamma * ce_loss
        return focal_loss.mean()
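
It is used exactly like a built-in criterion, with logits and integer class targets (dummy values below for illustration):

criterion = FocalLoss(alpha=1, gamma=2)
logits = torch.randn(8, 10)              # batch of 8 samples, 10 classes
targets = torch.randint(0, 10, (8,))
loss = criterion(logits, targets)        # scalar, ready for loss.backward()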

Q5: Implement gradient clipping

# Clip gradients to prevent exploding gradients
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

# Or clip by value
torch.nn.utils.clip_grad_value_(model.parameters(), clip_value=1.0)
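
Placement matters: clip after loss.backward() (so gradients exist) and before optimizer.step() (so the clipped values are the ones applied). A sketch using the names from the training loop above:

optimizer.zero_grad()
loss = criterion(model(data), target)
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # after backward, before step
optimizer.step()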

Q6: Explain and implement batch normalization

class ModelWithBN(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(100, 50)
        self.bn1 = nn.BatchNorm1d(50)
        self.fc2 = nn.Linear(50, 10)
        
    def forward(self, x):
        x = self.fc1(x)
        x = self.bn1(x)  # Normalize before activation
        x = F.relu(x)
        x = self.fc2(x)
        return x
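
Also worth stating in an interview: BatchNorm behaves differently in the two modes toggled by model.train() and model.eval(). In training mode it normalizes with the current batch's statistics and updates running estimates; in eval mode it uses those stored running statistics. A quick sketch:

bn_model = ModelWithBN()
x = torch.randn(16, 100)

bn_model.train()
out_train = bn_model(x)   # normalizes with batch statistics, updates running estimates

bn_model.eval()
out_eval = bn_model(x)    # normalizes with the stored running mean/variance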

Advanced Questions

Q7: Implement a custom Dataset with data augmentation

from PIL import Image  # needed for Image.open below

class CustomImageDataset(Dataset):
    def __init__(self, image_paths, labels, transform=None):
        self.image_paths = image_paths
        self.labels = labels
        self.transform = transform
        
    def __len__(self):
        return len(self.image_paths)
    
    def __getitem__(self, idx):
        image = Image.open(self.image_paths[idx]).convert('RGB')  # force 3 channels to match the transform below
        label = self.labels[idx]
        
        if self.transform:
            image = self.transform(image)
            
        return image, label

# With augmentation
transform = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(10),
    transforms.ColorJitter(brightness=0.2, contrast=0.2),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

Q8: Implement learning rate scheduling

# Different schedulers
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)
scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.95)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', 
                                                 patience=10, factor=0.1)

# Custom scheduler
class WarmupScheduler(optim.lr_scheduler._LRScheduler):
    def __init__(self, optimizer, warmup_epochs, last_epoch=-1):
        self.warmup_epochs = warmup_epochs
        super().__init__(optimizer, last_epoch)
        
    def get_lr(self):
        if self.last_epoch < self.warmup_epochs:
            return [base_lr * (self.last_epoch + 1) / self.warmup_epochs 
                    for base_lr in self.base_lrs]
        return self.base_lrs
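
Whichever scheduler you pick, call scheduler.step() once per epoch after the optimizer has updated the weights; ReduceLROnPlateau is the exception, since it needs the monitored metric. A sketch reusing the earlier training function:

scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)

for epoch in range(num_epochs):
    train_loss, _ = train_epoch(model, dataloader, criterion, optimizer, device)
    scheduler.step()                       # once per epoch, after the optimizer updates
    # ReduceLROnPlateau instead takes the monitored metric: scheduler.step(val_loss)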

Q9: Implement attention mechanism

class Attention(nn.Module):
    def __init__(self, hidden_dim):
        super(Attention, self).__init__()
        self.attn = nn.Linear(hidden_dim * 2, hidden_dim)
        self.v = nn.Linear(hidden_dim, 1, bias=False)
        
    def forward(self, hidden, encoder_outputs):
        # hidden: (batch_size, hidden_dim)
        # encoder_outputs: (batch_size, seq_len, hidden_dim)
        
        batch_size = encoder_outputs.size(0)
        seq_len = encoder_outputs.size(1)
        
        # Repeat hidden state
        hidden = hidden.unsqueeze(1).repeat(1, seq_len, 1)
        
        # Concatenate
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))
        
        # Compute attention scores
        attention = self.v(energy).squeeze(2)
        attention_weights = F.softmax(attention, dim=1)
        
        # Apply attention
        context = attention_weights.unsqueeze(2) * encoder_outputs
        context = context.sum(dim=1)
        
        return context, attention_weights
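
A shape check on dummy data (the sizes are arbitrary; hidden would typically be the last hidden state of the LSTM above):

attn = Attention(hidden_dim=256)
hidden = torch.randn(8, 256)               # e.g. the LSTM's last hidden state
encoder_outputs = torch.randn(8, 20, 256)  # 20 encoder time steps
context, weights = attn(hidden, encoder_outputs)
print(context.shape, weights.shape)        # torch.Size([8, 256]) torch.Size([8, 20])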

Q10: Mixed precision training

from torch.cuda.amp import autocast, GradScaler
# (newer PyTorch releases also expose these utilities under torch.amp)

model = Model().cuda()  # Model is a placeholder for any nn.Module; AMP here requires a CUDA device
optimizer = optim.Adam(model.parameters())
scaler = GradScaler()

for epoch in range(num_epochs):
    for batch in dataloader:
        optimizer.zero_grad()
        
        # Mixed precision
        with autocast():
            output = model(batch['input'])
            loss = criterion(output, batch['target'])
        
        # Scale loss and backward
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

Common Debugging Tips

  1. Shape mismatches: Always print tensor shapes.

print(f"Input shape: {x.shape}")
x = self.layer(x)
print(f"After layer shape: {x.shape}")

  2. Gradient flow issues: Check whether gradients are actually flowing.

for name, param in model.named_parameters():
    if param.grad is not None:
        print(f"{name}: grad_norm = {param.grad.norm()}")

  3. Memory issues: Clear the CUDA cache and check memory usage.

# Clear cache
torch.cuda.empty_cache()

# Check memory usage
print(f"Allocated: {torch.cuda.memory_allocated()/1024**2:.2f} MB")
print(f"Cached: {torch.cuda.memory_reserved()/1024**2:.2f} MB")

This guide covers the essential concepts of PyTorch from basics to advanced topics. Practice implementing these examples and understanding the concepts for interviews!
