본문 바로가기
Coloring (Additional Study)/Contest

DCC 한국음식 분류 모델

by 생각하는 이상훈 2023. 10. 29.
728x90

Model

한국음식 분류 모델을 구현해보았다.

ResNet의 pretrained weight를 이용하지 않기 때문에 성능이 많이 높지는 않지만 42개의 클래스에 대해서 분류하는 최고 성능을 낸 코드는 아래와 같다.

import copy
import os
import random

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms, models
from tqdm import tqdm

def main():
    """Train a ResNet-50 from scratch to classify 42 Korean food categories.

    Expects an ImageFolder-style directory layout:
      - ``train``          : training images, one sub-directory per class
      - ``kfood_val/val``  : validation images, one sub-directory per class

    Validates after every epoch and keeps (in memory) the weights of the
    epoch with the best validation accuracy, loading them back at the end.
    """
    # NOTE: these are directory paths, not DataLoaders (the original names
    # `train_loader`/`valid_loader` were misleading).
    train_dir = os.path.join('train')
    valid_dir = os.path.join('kfood_val', 'val')

    print("train_cls_num: ", len(os.listdir(train_dir)))
    print("valid_cls_num: ", len(os.listdir(valid_dir)))

    # Per-channel normalization stats (maps pixels roughly into [-1, 1]).
    mean = [0.5, 0.5, 0.5]
    std = [0.5, 0.5, 0.5]

    transform_train = transforms.Compose([
        transforms.Resize((244, 244)),
        transforms.RandomResizedCrop((224, 224)),
        transforms.RandomHorizontalFlip(),
        transforms.RandomRotation(90),
        transforms.ToTensor(),
        transforms.Normalize(mean, std),
    ])

    # NOTE(review): validation images stay at 244x244 while training crops
    # to 224x224 — a train/eval input-size mismatch. Kept as-is because it
    # is what produced the reported best score; confirm intent before
    # changing.
    transform_valid = transforms.Compose([
        transforms.Resize((244, 244)),
        transforms.ToTensor(),
        transforms.Normalize(mean, std),
    ])

    dataset_train = datasets.ImageFolder(root=train_dir, transform=transform_train)
    dataset_train_loader = torch.utils.data.DataLoader(
        dataset_train, batch_size=32, shuffle=True, num_workers=os.cpu_count())

    valid_train = datasets.ImageFolder(root=valid_dir, transform=transform_valid)
    valid_train_loader = torch.utils.data.DataLoader(
        valid_train, batch_size=32, shuffle=False, num_workers=os.cpu_count())

    # Train from scratch (no ImageNet weights) and replace the classifier
    # head with a 42-way output layer.
    resnet50 = models.resnet50(pretrained=False)
    resnet50.fc = nn.Linear(resnet50.fc.in_features, 42)

    # Fall back to CPU when no GPU is present; the original hard-coded
    # 'cuda:0' and crashed on CPU-only machines.
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    resnet50 = resnet50.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(resnet50.parameters(), lr=0.0001)

    def evaluate_model(model, criterion):
        """Return (average loss, accuracy) of *model* on the validation set."""
        model.eval()
        total_loss = 0.0
        total_corrects = 0
        with torch.no_grad():
            for inputs, labels in valid_train_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                _, preds = torch.max(outputs, 1)
                # Accumulate sample-weighted loss so the average is per-image.
                total_loss += loss.item() * inputs.size(0)
                total_corrects += torch.sum(preds == labels.data)
        accuracy = total_corrects.double() / len(valid_train)
        avg_loss = total_loss / len(valid_train)
        return avg_loss, accuracy.item()

    def train(model, criterion, optimizer, num_epochs):
        """Train for *num_epochs*, validating after each epoch.

        Returns the model loaded with the weights of the epoch that
        achieved the highest validation accuracy.
        """
        best_model_wts = copy.deepcopy(model.state_dict())
        best_acc = 0.0

        for epoch in range(num_epochs):
            # Re-enable training mode each epoch (evaluate_model switches
            # the model to eval mode).
            model.train()
            print(f'Epoch {epoch}/{num_epochs - 1}')
            print('-' * 10)

            running_loss = 0.0
            running_corrects = 0

            for inputs, labels in tqdm(dataset_train_loader, desc="Training", leave=False):
                inputs = inputs.to(device)
                labels = labels.to(device)

                optimizer.zero_grad()
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

                _, preds = torch.max(outputs, 1)
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)

            epoch_loss = running_loss / len(dataset_train)
            epoch_acc = running_corrects.double() / len(dataset_train)

            # Validation phase
            val_loss, val_accuracy = evaluate_model(model, criterion)

            print(f'Train Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')
            print(f'Val Loss: {val_loss:.4f} Acc: {val_accuracy:.4f}')

            # Checkpoint in memory whenever validation accuracy improves.
            if val_accuracy > best_acc:
                best_acc = val_accuracy
                best_model_wts = copy.deepcopy(model.state_dict())

        print(f'Best Validation Accuracy: {best_acc:.4f}')
        model.load_state_dict(best_model_wts)
        return model

    resnet50 = train(resnet50, criterion, optimizer, 150)

if __name__ == '__main__':
    main()

validation set에 대한 정확도를 올리는 것이 목표이므로, 매 epoch마다 validation set에 대해 평가를 수행했다. 최고 점수를 기록한 epoch의 학습 weight를 저장해 최종 모델로 선정하는 코드를 적용했고, 그 덕분에 early stopping 없이도 좋은 성능을 찾는 것이 가능했다.

이론과 실제 모델의 성능은 다른 경우가 정말 많다는 것을 다시 느끼게 되었다. 내가 시도했던 내용을 몇가지 소개해보겠다.

Random_augmentation

def random_augmentation(img):
    """Apply three distinct, randomly chosen transforms to *img*.

    Candidates cover color jitter, crops, affine warps, flips, rotation,
    perspective distortion and grayscale conversion. Intended to be placed
    at the front of a torchvision transform pipeline (operates on a PIL
    image, before ToTensor).

    Fix: uses ``random.sample``, but the original snippet never imported
    ``random`` and raised NameError at call time — now imported at the top
    of the file.
    """
    # Pool of candidate transforms.
    transforms_list = [
        transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
        transforms.RandomResizedCrop(size=(224, 224), scale=(0.7, 1.0)),
        transforms.RandomAffine(degrees=0, translate=(0.2, 0.2)),
        transforms.RandomAffine(degrees=0, scale=(0.7, 1.3)),
        transforms.RandomAffine(degrees=(-70, 70)),
        transforms.RandomHorizontalFlip(p=1),
        transforms.RandomVerticalFlip(p=1),
        transforms.RandomRotation(15),
        transforms.RandomPerspective(distortion_scale=0.3, p=0.5),
        transforms.RandomResizedCrop(size=(224, 224), scale=(0.6, 0.9)),
        transforms.RandomGrayscale(p=1),
    ]

    # Choose 3 distinct transforms at random and apply them in sequence.
    chosen_transforms = random.sample(transforms_list, 3)
    composed_transforms = transforms.Compose(chosen_transforms)
    return composed_transforms(img)

    # Add random_augmentation as the first step of the data pipeline.
    # NOTE(review): in this paste these lines sit after random_augmentation's
    # return statement and are therefore unreachable; they illustrate how
    # the pipeline was wired in the actual training script.
    transform_train = transforms.Compose([
        random_augmentation,
        transforms.Resize((244,244)),
        transforms.ToTensor(),
        transforms.Normalize(mean, std)
    ])

    # Validation keeps a deterministic pipeline (no augmentation).
    transform_valid = transforms.Compose([
                    transforms.Resize((244,244)),
                    transforms.ToTensor(),
                    transforms.Normalize(mean, std)
                    ])

논문과 사람들의 평가로 Random augmentation 기법을 이용해 보았으나 미세하게 성능이 떨어졌다. class 수와 데이터 수가 적어서 보편적으로 좋은 성능을 보이는 기법들이 overfitting을 만드는 경우가 존재하는 것으로 예상됐다.

    # Alternative per-channel normalization statistics, carried over from an
    # earlier ResNet-18 experiment (the surrounding post reports this choice
    # caused overfitting here).
    mean = [0.58394545, 0.51383334, 0.42269564]
    std = [0.22759153, 0.23792826, 0.25431448]
    
    # Training pipeline: resize, random crop to 224x224, flip, rotate.
    transform_train = transforms.Compose([
                    transforms.Resize((244,244)),
                    transforms.RandomResizedCrop((224,224)),
                    transforms.RandomHorizontalFlip(),
                    transforms.RandomRotation(90),
                    transforms.ToTensor(),
                    transforms.Normalize(mean, std)
                    ])

    # Validation pipeline: deterministic resize only.
    transform_valid = transforms.Compose([
                    transforms.Resize((244,244)),
                    transforms.ToTensor(),
                    transforms.Normalize(mean, std)
                    ])

best score가 나온 모델을 찾기 전에 resnet18에서 잘 적용되었던 mean, std값을 이용했는데 overfitting이 되는 문제가 발생하였다.


728x90