Merge pull request #977 from MaxVanDijck/main

msaroufim · web-flow · commit 0cb38ebb1b6e · 2022-03-17T09:10:20.000-07:00
Change imagenet data script to include training data
diff --git a/imagenet/README.md b/imagenet/README.md
@@ -7,7 +7,7 @@ This implements training of popular model architectures, such as ResNet, AlexNet
 - Install PyTorch ([pytorch.org](http://pytorch.org))
 - `pip install -r requirements.txt`
 - Download the ImageNet dataset from http://www.image-net.org/
-    - Then, and move validation images to labeled subfolders, using [the following shell script](https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh)
+    - Then, move and extract the training and validation images to labeled subfolders, using [the following shell script](extract_ILSVRC.sh)
 
 ## Training
 
diff --git a/imagenet/extract_ILSVRC.sh b/imagenet/extract_ILSVRC.sh
@@ -0,0 +1,80 @@
+#!/bin/bash
+#
+# Script to extract the ImageNet (ILSVRC2012) dataset:
+#   ILSVRC2012_img_train.tar (about 138 GB)
+#   ILSVRC2012_img_val.tar (about 6.3 GB)
+# Make sure both .tar files are in your current directory before running.
+#
+#  Adapted from:
+#  https://github.com/facebook/fb.resnet.torch/blob/master/INSTALL.md
+#  https://gist.github.com/BIGBALLON/8a71d225eff18d88e469e6ea9b39cef4
+#
+#  imagenet/train/
+#  ├── n01440764
+#  │   ├── n01440764_10026.JPEG
+#  │   ├── n01440764_10027.JPEG
+#  │   ├── ......
+#  ├── ......
+#  imagenet/val/
+#  ├── n01440764
+#  │   ├── ILSVRC2012_val_00000293.JPEG
+#  │   ├── ILSVRC2012_val_00002138.JPEG
+#  │   ├── ......
+#  ├── ......
+#
+# Abort on the first failure: carrying on after a failed mkdir/mv/cd would extract into, and delete archives from, the wrong directory
+set -euo pipefail
+# Make imagenet directory (-p: an already existing directory is not an error)
+mkdir -p imagenet
+#
+# Extract the training data:
+# Create train directory; move .tar file; change directory
+mkdir -p imagenet/train
+mv ILSVRC2012_img_train.tar imagenet/train/
+cd imagenet/train
+# Extract training set; remove compressed file (only reached when the extraction succeeded)
+tar -xvf ILSVRC2012_img_train.tar
+rm -f ILSVRC2012_img_train.tar
+#
+# At this stage imagenet/train will contain 1000 compressed .tar files, one for each category
+# For each .tar file (IFS= and read -r keep every path exactly as find printed it):
+#   1. create directory with same name as .tar file
+#   2. extract and copy contents of .tar file into directory
+#   3. remove .tar file (never reached for an archive whose extraction failed)
+find . -name "*.tar" | while IFS= read -r NAME ; do mkdir -p "${NAME%.tar}"; tar -xvf "${NAME}" -C "${NAME%.tar}"; rm -f "${NAME}"; done
+#
+# This results in a training directory like so:
+#  imagenet/train/
+#  ├── n01440764
+#  │   ├── n01440764_10026.JPEG
+#  │   ├── n01440764_10027.JPEG
+#  │   ├── ......
+#  ├── ......
+#
+# Change back to original directory
+cd ../..
+#
+# Extract the validation data and move images to subfolders:
+# Create validation directory; move .tar file; change directory; extract validation .tar; remove compressed file
+mkdir -p imagenet/val
+mv ILSVRC2012_img_val.tar imagenet/val/
+cd imagenet/val
+tar -xvf ILSVRC2012_img_val.tar
+rm -f ILSVRC2012_img_val.tar
+# get script from soumith and run; this script creates all class directories and moves images into corresponding directories
+wget -qO- https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh | bash
+#
+# This results in a validation directory like so:
+#  imagenet/val/
+#  ├── n01440764
+#  │   ├── ILSVRC2012_val_00000293.JPEG
+#  │   ├── ILSVRC2012_val_00002138.JPEG
+#  │   ├── ......
+#  ├── ......
+#
+# Check total files after extract
+#
+#  $ find train/ -name "*.JPEG" | wc -l
+#  1281167
+#  $ find val/ -name "*.JPEG" | wc -l
+#  50000
diff --git a/imagenet/main.py b/imagenet/main.py
@@ -25,8 +25,8 @@
     and callable(models.__dict__[name]))
 
 parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')
-parser.add_argument('data', metavar='DIR',
-                    help='path to dataset')
+parser.add_argument('data', metavar='DIR', nargs='?', default='imagenet',
+                    help='path to dataset (default: imagenet)')
 parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet18',
                     choices=model_names,
                     help='model architecture: ' +