# PyTorch DataLoader

Here are some code snippets using the PyTorch `DataLoader`.

## Custom image folders

```py
# Create the dataset
train_ds = dset.ImageFolder(root=f"{args.data_root}/train", transform=transform)
# Create the dataloader
train_dl = torch.utils.data.DataLoader(train_ds, batch_size=args.batch_size, shuffle=True)

# Create the dataset
test_ds = dset.ImageFolder(root=f"{args.data_root}/test", transform=transform)
# Create the dataloader
test_dl = torch.utils.data.DataLoader(test_ds, batch_size=args.batch_size, shuffle=False)
```

### dcgan_celeba.py

```py
def load_data(args, show_images=False):
    """
    Load the datasets
    """
    # dset.CelebA(args.data_root, split='all', download=True)

    transform = transforms.Compose([
        transforms.Resize(args.image_size),
        transforms.CenterCrop(args.image_size),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])

    # Create the dataset
    train_ds = dset.ImageFolder(root=args.data_root, transform=transform)

    # Create the dataloader
    train_dl = torch.utils.data.DataLoader(train_ds,
                                           batch_size=args.batch_size,
                                           shuffle=True,
                                           num_workers=args.workers)

    args.experiment.log_dataset_info(train_ds)

    # Decide which device we want to run on
    device = torch.device("cuda:0" if (torch.cuda.is_available() and args.ngpu > 0) else "cpu")

    if show_images:
        # Plot some training images
        real_batch = next(iter(train_dl))
        plt.figure(figsize=(8, 8))
        plt.axis("off")
        plt.title("Training Images")
        plt.imshow(np.transpose(
            vutils.make_grid(real_batch[0].to(device)[:64], padding=2, normalize=True).cpu(),
            (1, 2, 0)))

    return train_dl, device
```

### gan_mnist.py

```py
def load_mnist_data(args):
    """
    Load MNIST datasets

    We create a dataset object and a data loader that batches
    and shuffles post-transformation images for us.
    """
    # Image processing
    # transform = transforms.Compose([
    #     transforms.ToTensor(),
    #     transforms.Normalize(mean=(0.5, 0.5, 0.5),  # 3 for RGB channels
    #                          std=(0.5, 0.5, 0.5))])
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5],  # 1 for greyscale channels
                             std=[0.5])])

    # DataLoader is the PyTorch module to combine the image
    # and its corresponding label in a package.
    # We can define a simple transformation that converts images to tensors
    # then applies a standard normalization procedure for easier training.
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,))
    ])

    # Create a dataset object and a data loader that batches
    # and shuffles post-transformation images for us.
    trainset = datasets.MNIST(root=args.root, download=True, train=True, transform=transform)
    train_loader = DataLoader(dataset=trainset, batch_size=args.batch_size, shuffle=True)

    return train_loader
```

### pytorch_cifar.py

```py
def load_data():
    """
    Load and normalize CIFAR10 dataset.

    We create data loaders, which allow us to load data in batches;
    this is useful when a large dataset will not fit into memory for training.
    You can try different batch sizes by doubling (128, 256, 512)
    for as long as the batch still fits in GPU/memory and is processed faster.
    When it starts to slow down, you can decrease the batch size by one step.

    shuffle=True gives randomization to the data
    """
    batch_size = 4
    num_workers = 4  # number of sub-processes to use for data loading (parallelization)
    pin_memory = True  # dataloader copies Tensors to pinned memory before returning them

    transform = transforms.Compose(
        [transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]
    )

    train_ds = torchvision.datasets.CIFAR10(
        root="./data", train=True, download=True, transform=transform
    )

    test_ds = torchvision.datasets.CIFAR10(
        root="./data", train=False, download=True, transform=transform
    )

    train_loader = torch.utils.data.DataLoader(
        train_ds,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers,
        pin_memory=pin_memory
    )

    test_loader = torch.utils.data.DataLoader(
        test_ds,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
        pin_memory=pin_memory
    )

    classes = (
        "plane",
        "car",
        "bird",
        "cat",
        "deer",
        "dog",
        "frog",
        "horse",
        "ship",
        "truck",
    )

    return train_loader, test_loader, classes
```

### pytorch_mnist.py

```py
transform = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
)

train_data = datasets.MNIST("../data", train=True, download=True, transform=transform)
test_data = datasets.MNIST("../data", train=False, transform=transform)

trainloader = torch.utils.data.DataLoader(train_data, **train_kwargs)
testloader = torch.utils.data.DataLoader(test_data, **test_kwargs)
```

### wgan_mnist.py

```py
def load_data(args):
    """
    Load and normalize the MNIST dataset.

    We create data loaders, which allow us to load data in batches
    when we have a large dataset that will not fit into memory for training.
    We can try different batch sizes by doubling (128, 256, 512)
    for as long as the batch still fits in GPU/memory and is processed faster.
    When it starts to slow down, you can decrease the batch size by one step.

    References:
        https://blog.paperspace.com/dataloaders-abstractions-pytorch/
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    kwargs = {'num_workers': args.num_workers, 'pin_memory': args.pin_memory} if device == 'cuda' else {}

    # Configure dataloader
    # os.makedirs("../data", exist_ok=True)

    transform = transforms.Compose(
        [transforms.Resize(args.img_size),
         transforms.ToTensor(),
         transforms.Normalize([0.5], [0.5])]
        # transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]
    )

    train_dataset = datasets.MNIST(
        root=args.root,  # root directory of dataset
        train=True,      # create dataset from training set
        download=True,   # download dataset from internet to root
        transform=transform
    )

    test_ds = datasets.MNIST(
        root=args.root,
        train=False,
        download=True,
        transform=transform
    )

    train_size = len(train_dataset) - args.val_size

    # train / validation split
    train_ds, val_ds = random_split(train_dataset, [train_size, args.val_size])

    train_loader = DataLoader(
        train_ds,
        batch_size=args.batch_size,
        shuffle=True,  # Gives randomization to the data
        **kwargs
    )

    val_loader = DataLoader(
        val_ds,
        batch_size=args.batch_size,
        shuffle=False,
        **kwargs
    )

    test_loader = DataLoader(
        test_ds,
        batch_size=args.batch_size_test,
        shuffle=False,
        **kwargs
    )

    print("train: {} val: {} test: {}".format(
        len(train_loader.dataset),
        len(val_loader.dataset),
        len(test_loader.dataset),
    ))

    return train_loader, val_loader, test_loader
```

## References

- [PyTorch Lightning: DataModules, Callbacks, TPU, and Loggers](https://krypticmouse.hashnode.dev/pytorch-lightning-datamodules-callbacks-tpu-and-loggers)
- [Training PyTorch on Cloud TPUs](https://ultrons.medium.com/training-pytorch-on-cloud-tpus-be0649e4efbc)