Xuanyi Dong 2019-03-30 00:50:18 +08:00
parent a94a67b55d
commit 3734384b68
29 changed files with 93 additions and 1184 deletions

View File

@@ -13,13 +13,6 @@ conda install pytorch torchvision cuda100 -c pytorch
 ## Algorithm
-Searching CNNs
-```
-bash ./scripts-cnn/search.sh 1 base cifar10
-bash ./scripts-cnn/DMS-V-TrainV3.sh 1
-bash ./scripts-cnn/search-acc-v2.sh 3 acc2
-```
 Train the searched CNN on CIFAR
 ```
 bash ./scripts-cnn/train-cifar.sh 0 GDAS_FG cifar10 cut
@@ -36,10 +29,10 @@ bash ./scripts-cnn/train-imagenet.sh 0 GDAS_V1 50 14
 Train the searched RNN
 ```
-bash ./scripts-rnn/train-PTB.sh 0 DARTS_V1
-bash ./scripts-rnn/train-PTB.sh 0 DARTS_V2
-bash ./scripts-rnn/train-PTB.sh 0 GDAS
-bash ./scripts-rnn/train-WT2.sh 0 DARTS_V1
-bash ./scripts-rnn/train-WT2.sh 0 DARTS_V2
-bash ./scripts-rnn/train-WT2.sh 0 GDAS
+CUDA_VISIBLE_DEVICES=0 bash ./scripts-rnn/train-PTB.sh DARTS_V1
+CUDA_VISIBLE_DEVICES=0 bash ./scripts-rnn/train-PTB.sh DARTS_V2
+CUDA_VISIBLE_DEVICES=0 bash ./scripts-rnn/train-PTB.sh GDAS
+CUDA_VISIBLE_DEVICES=0 bash ./scripts-rnn/train-WT2.sh DARTS_V1
+CUDA_VISIBLE_DEVICES=0 bash ./scripts-rnn/train-WT2.sh DARTS_V2
+CUDA_VISIBLE_DEVICES=0 bash ./scripts-rnn/train-WT2.sh GDAS
 ```

View File

@@ -1,30 +0,0 @@
#!/usr/bin/env sh
if [ "$#" -ne 3 ]; then
  echo "Illegal number of parameters: $#"
  echo "Need 3 parameters: the GPUs, the epochs, and the cutout"
  exit 1
fi
if [ "$TORCH_HOME" = "" ]; then
  echo "Must set the TORCH_HOME environment variable for the data directory"
  exit 1
else
  echo "TORCH_HOME : $TORCH_HOME"
fi
gpus=$1
arch=acc2
cutout=$3
dataset=cifar10
epoch=$2
SAVED=./snapshots/NAS/ACC-V2-Search-${arch}-${dataset}-cut${cutout}-${epoch}-E600
CUDA_VISIBLE_DEVICES=${gpus} python ./exps-nas/acc_search_v2.py \
--data_path $TORCH_HOME/cifar.python \
--arch ${arch} --dataset ${dataset} --batch_size 128 \
--save_path ${SAVED} \
--learning_rate_max 0.05 --learning_rate_min 0.001 --momentum 0.9 --weight_decay 0.0003 \
--epochs ${epoch} --cutout ${cutout} --validate --grad_clip 5 \
--init_channels 16 --layers 8 \
--tau_max 10 --tau_min 4 \
--model_config ./configs/nas-cifar-cos.config \
--print_freq 100 --workers 8
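
The `--tau_max 10 --tau_min 4` pair above drives a linearly annealed Gumbel-Softmax temperature, matching the `set_tau` call in the search code later in this commit. A minimal sketch of that schedule (the function name is illustrative):

```python
def tau_at(epoch, total_epochs, tau_max=10.0, tau_min=4.0):
    # Linear decay from tau_max to tau_min over training, i.e.
    # tau_max - epoch / total_epochs * (tau_max - tau_min).
    return tau_max - epoch * (tau_max - tau_min) / float(total_epochs)

# For example, with the values above: tau_at(0, 600) == 10.0 and tau_at(300, 600) == 7.0.
```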

View File

@@ -1,57 +0,0 @@
# Neural-Architecture-Search
### Baseline
```
bash ./scripts-nas/search.sh 1 base cifar10
bash ./scripts-nas/search.sh 1 share
bash ./scripts-nas/batch-base-search.sh 1
bash ./scripts-nas/batch-base-model.sh 1
```
### Meta
```
bash ./scripts-nas/meta-search.sh 0 meta 20 5
```
### Acceleration
```
bash ./scripts-nas/search-acc-v2.sh 3 acc2
bash ./scripts-nas/DMS-V-Train.sh 0
bash ./scripts-nas/search-acc-simple.sh 3 NetworkV2
```
### Base Model Training
```
bash ./scripts-nas/train-model.sh 3 AmoebaNet
bash ./scripts-nas/train-model.sh 3 NASNet
bash ./scripts-nas/train-model.sh 3 DARTS_V1
bash ./scripts-nas/train-model-simple.sh 3 AmoebaNet
bash ./scripts-nas/train-imagenet.sh 3 DARTS_V2 50 14
bash scripts-nas/TRAIN-BASE.sh 0 PNASNet cifar10 nocut 48 11
bash scripts-nas/TRAIN-BASE.sh 0 AmoebaNet cifar10 nocut 36 20
bash scripts-nas/TRAIN-BASE.sh 0 NASNet cifar10 nocut 33 20
bash scripts-nas/TRAIN-BASE.sh 0 DMS_F1 cifar10 nocut 36 20
bash scripts-nas/TRAIN-BASE.sh 0 DMS_V1 cifar10 nocut 36 20
bash scripts-nas/TRAIN-BASE.sh 0 GDAS_CC cifar10 nocut 36 20
bash scripts-nas/train-imagenet.sh 3 DMS_F1 52 14
bash scripts-nas/train-imagenet.sh 3 DMS_V1 50 14
bash scripts-nas/TRAIN-BASE.sh 0 DMS_V1 cifar10 nocut 36 20
```
### Visualization
```
python ./exps-nas/vis-arch.py --checkpoint --save_dir
python ./exps-nas/cvpr-vis.py --save_dir ./snapshots/NAS-VIS/
```
### Test datasets
```
cd ./lib/datasets/
python test_NLP.py
```

View File

@@ -1,30 +0,0 @@
#!/usr/bin/env sh
# bash scripts-nas/TRAIN-BASE.sh 0 DMS_V1 cifar10 nocut init-channel layers
if [ "$#" -ne 6 ]; then
  echo "Illegal number of parameters: $#"
  echo "Need 6 parameters: the GPUs, the architecture, the dataset, the config, the initial channels, and the number of layers"
  exit 1
fi
if [ "$TORCH_HOME" = "" ]; then
  echo "Must set the TORCH_HOME environment variable for the data directory"
  exit 1
else
  echo "TORCH_HOME : $TORCH_HOME"
fi
gpus=$1
arch=$2
dataset=$3
config=$4
C=$5
N=$6
SAVED=./snapshots/NAS/${arch}-${C}-${N}-${dataset}-${config}-E600
CUDA_VISIBLE_DEVICES=${gpus} python ./exps-nas/train_base.py \
--data_path $TORCH_HOME/cifar.python \
--dataset ${dataset} --arch ${arch} \
--save_path ${SAVED} \
--grad_clip 5 \
--init_channels ${C} --layers ${N} \
--model_config ./configs/nas-cifar-cos-${config}.config \
--print_freq 100 --workers 8
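
For reference, the `(init-channels, layers)` pairs that the README above passes to this script as `$5`/`$6` can be collected into a small lookup; this is a convenience sketch for readers, not a table the repo defines:

```python
# (init_channels, layers) per architecture, as invoked via TRAIN-BASE.sh above
TRAIN_BASE_CONFIGS = {
    "PNASNet":   (48, 11),
    "AmoebaNet": (36, 20),
    "NASNet":    (33, 20),
    "DMS_F1":    (36, 20),
    "DMS_V1":    (36, 20),
    "GDAS_CC":   (36, 20),
}
```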

View File

@@ -1,23 +0,0 @@
#!/usr/bin/env sh
set -e
if [ "$#" -ne 1 ]; then
  echo "Illegal number of parameters: $#"
  echo "Need 1 parameter: the GPUs"
  exit 1
fi
if [ "$TORCH_HOME" = "" ]; then
  echo "Must set the TORCH_HOME environment variable for the data directory"
  exit 1
else
  echo "TORCH_HOME : $TORCH_HOME"
fi
gpus=$1
bash ./scripts-nas/train-model.sh ${gpus} AmoebaNet 0
bash ./scripts-nas/train-model.sh ${gpus} NASNet 0
bash ./scripts-nas/train-model.sh ${gpus} DARTS_V1 0
bash ./scripts-nas/train-model.sh ${gpus} DARTS_V2 0

View File

@@ -1,19 +0,0 @@
#!/usr/bin/env sh
if [ "$#" -ne 1 ]; then
  echo "Illegal number of parameters: $#"
  echo "Need 1 parameter: the GPUs"
  exit 1
fi
if [ "$TORCH_HOME" = "" ]; then
  echo "Must set the TORCH_HOME environment variable for the data directory"
  exit 1
else
  echo "TORCH_HOME : $TORCH_HOME"
fi
gpus=$1
Times="1 2 3"
for time in ${Times}; do
  # search.sh expects three arguments (GPUs, architecture, dataset);
  # passing only ${gpus} would trip its own argument check.
  bash ./scripts-nas/search.sh ${gpus} base cifar10
done

View File

@@ -1,30 +0,0 @@
#!/usr/bin/env sh
if [ "$#" -ne 4 ]; then
  echo "Illegal number of parameters: $#"
  echo "Need 4 parameters: the GPUs, the network, N-way, and K-shot"
  exit 1
fi
if [ "$TORCH_HOME" = "" ]; then
  echo "Must set the TORCH_HOME environment variable for the data directory"
  exit 1
else
  echo "TORCH_HOME : $TORCH_HOME"
fi
gpus=$1
arch=$2
n_way=$3
k_shot=$4
cutout=16
epoch=60
SAVED=./snapshots/NAS/Meta-Search-${arch}-N${n_way}-K${k_shot}-cut${cutout}-${epoch}
CUDA_VISIBLE_DEVICES=${gpus} python ./exps-nas/meta_search.py \
--data_path $TORCH_HOME/tiered-imagenet \
--arch ${arch} --n_way ${n_way} --k_shot ${k_shot} \
--save_path ${SAVED} \
--learning_rate_max 0.001 --learning_rate_min 0.0001 --momentum 0.9 --weight_decay 0.0003 \
--epochs ${epoch} --cutout ${cutout} --validate --grad_clip 5 \
--init_channels 16 --layers 8 \
--model_config ./configs/nas-cifar-cos-cut.config \
--print_freq 200 --workers 16
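
`meta_search.py` itself is not shown in this diff; as a rough, generic illustration of what the `--n_way`/`--k_shot` options imply, an episodic sampler looks roughly like the following (all names and the query-set size are hypothetical):

```python
import random

def sample_episode(class_to_images, n_way=20, k_shot=5, n_query=15):
    # Pick n_way classes, then k_shot support and n_query query images
    # per class; labels are re-indexed 0..n_way-1 within the episode.
    classes = random.sample(sorted(class_to_images), n_way)
    support, query = [], []
    for label, cls in enumerate(classes):
        images = random.sample(class_to_images[cls], k_shot + n_query)
        support += [(img, label) for img in images[:k_shot]]
        query   += [(img, label) for img in images[k_shot:]]
    return support, query
```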

View File

@@ -1,29 +0,0 @@
#!/usr/bin/env sh
if [ "$#" -ne 2 ]; then
  echo "Illegal number of parameters: $#"
  echo "Need 2 parameters: the GPUs and the network"
  exit 1
fi
if [ "$TORCH_HOME" = "" ]; then
  echo "Must set the TORCH_HOME environment variable for the data directory"
  exit 1
else
  echo "TORCH_HOME : $TORCH_HOME"
fi
gpus=$1
arch=$2
cutout=0
dataset=cifar10
epoch=100
SAVED=./snapshots/NAS/ACC-V2-Search-${arch}-${dataset}-cut${cutout}-${epoch}-E100
CUDA_VISIBLE_DEVICES=${gpus} python ./exps-nas/acc_search_v2.py \
--data_path $TORCH_HOME/cifar.python \
--arch ${arch} --dataset ${dataset} --batch_size 128 \
--save_path ${SAVED} \
--learning_rate_max 0.025 --learning_rate_min 0.001 --momentum 0.9 --weight_decay 0.0003 \
--epochs ${epoch} --cutout ${cutout} --validate --grad_clip 5 \
--init_channels 16 --layers 8 \
--model_config ./configs/nas-cifar-cos-simple.config \
--print_freq 100 --workers 8

View File

@@ -1,29 +0,0 @@
#!/usr/bin/env sh
if [ "$#" -ne 2 ]; then
  echo "Illegal number of parameters: $#"
  echo "Need 2 parameters: the GPUs and the network"
  exit 1
fi
if [ "$TORCH_HOME" = "" ]; then
  echo "Must set the TORCH_HOME environment variable for the data directory"
  exit 1
else
  echo "TORCH_HOME : $TORCH_HOME"
fi
gpus=$1
arch=$2
cutout=0
dataset=cifar10
epoch=150
SAVED=./snapshots/NAS/ACC-V2-Search-${arch}-${dataset}-cut${cutout}-${epoch}-E600
CUDA_VISIBLE_DEVICES=${gpus} python ./exps-nas/acc_search_v2.py \
--data_path $TORCH_HOME/cifar.python \
--arch ${arch} --dataset ${dataset} --batch_size 128 \
--save_path ${SAVED} \
--learning_rate_max 0.025 --learning_rate_min 0.001 --momentum 0.9 --weight_decay 0.0003 \
--epochs ${epoch} --cutout ${cutout} --validate --grad_clip 5 \
--init_channels 16 --layers 8 \
--model_config ./configs/nas-cifar-cos.config \
--print_freq 100 --workers 8

View File

@@ -1,29 +0,0 @@
#!/usr/bin/env sh
if [ "$#" -ne 2 ]; then
  echo "Illegal number of parameters: $#"
  echo "Need 2 parameters: the GPUs and the network"
  exit 1
fi
if [ "$TORCH_HOME" = "" ]; then
  echo "Must set the TORCH_HOME environment variable for the data directory"
  exit 1
else
  echo "TORCH_HOME : $TORCH_HOME"
fi
gpus=$1
arch=$2
cutout=0
dataset=cifar10
epoch=200
SAVED=./snapshots/NAS/ACC-V2-Search-${arch}-${dataset}-cut${cutout}-${epoch}-E600
CUDA_VISIBLE_DEVICES=${gpus} python ./exps-nas/acc_search_v2.py \
--data_path $TORCH_HOME/cifar.python \
--arch ${arch} --dataset ${dataset} --batch_size 128 \
--save_path ${SAVED} \
--learning_rate_max 0.025 --learning_rate_min 0.001 --momentum 0.9 --weight_decay 0.0003 \
--epochs ${epoch} --cutout ${cutout} --validate --grad_clip 5 \
--init_channels 16 --layers 8 \
--model_config ./configs/nas-cifar-cos.config \
--print_freq 100 --workers 8

View File

@@ -1,29 +0,0 @@
#!/usr/bin/env sh
if [ "$#" -ne 2 ]; then
  echo "Illegal number of parameters: $#"
  echo "Need 2 parameters: the GPUs and the network"
  exit 1
fi
if [ "$TORCH_HOME" = "" ]; then
  echo "Must set the TORCH_HOME environment variable for the data directory"
  exit 1
else
  echo "TORCH_HOME : $TORCH_HOME"
fi
gpus=$1
arch=$2
cutout=0
dataset=cifar10
epoch=300
SAVED=./snapshots/NAS/ACC-V2-Search-${arch}-${dataset}-cut${cutout}-${epoch}-E600
CUDA_VISIBLE_DEVICES=${gpus} python ./exps-nas/acc_search_v2.py \
--data_path $TORCH_HOME/cifar.python \
--arch ${arch} --dataset ${dataset} --batch_size 128 \
--save_path ${SAVED} \
--learning_rate_max 0.025 --learning_rate_min 0.001 --momentum 0.9 --weight_decay 0.0003 \
--epochs ${epoch} --cutout ${cutout} --validate --grad_clip 5 \
--init_channels 16 --layers 8 \
--model_config ./configs/nas-cifar-cos.config \
--print_freq 100 --workers 8

View File

@@ -1,29 +0,0 @@
#!/usr/bin/env sh
if [ "$#" -ne 2 ]; then
  echo "Illegal number of parameters: $#"
  echo "Need 2 parameters: the GPUs and the network"
  exit 1
fi
if [ "$TORCH_HOME" = "" ]; then
  echo "Must set the TORCH_HOME environment variable for the data directory"
  exit 1
else
  echo "TORCH_HOME : $TORCH_HOME"
fi
gpus=$1
arch=$2
cutout=0
dataset=cifar10
epoch=50
SAVED=./snapshots/NAS/ACC-V2-Search-${arch}-${dataset}-cut${cutout}-${epoch}-E600
CUDA_VISIBLE_DEVICES=${gpus} python ./exps-nas/acc_search_v2.py \
--data_path $TORCH_HOME/cifar.python \
--arch ${arch} --dataset ${dataset} --batch_size 128 \
--save_path ${SAVED} \
--learning_rate_max 0.025 --learning_rate_min 0.001 --momentum 0.9 --weight_decay 0.0003 \
--epochs ${epoch} --cutout ${cutout} --validate --grad_clip 5 \
--init_channels 16 --layers 8 \
--model_config ./configs/nas-cifar-cos.config \
--print_freq 100 --workers 8

View File

@@ -1,29 +0,0 @@
#!/usr/bin/env sh
if [ "$#" -ne 2 ]; then
  echo "Illegal number of parameters: $#"
  echo "Need 2 parameters: the GPUs and the network"
  exit 1
fi
if [ "$TORCH_HOME" = "" ]; then
  echo "Must set the TORCH_HOME environment variable for the data directory"
  exit 1
else
  echo "TORCH_HOME : $TORCH_HOME"
fi
gpus=$1
arch=$2
cutout=0
dataset=cifar10
epoch=100
SAVED=./snapshots/NAS/ACC-V2-Search-${arch}-${dataset}-cut${cutout}-${epoch}-E600
CUDA_VISIBLE_DEVICES=${gpus} python ./exps-nas/acc_search_v2.py \
--data_path $TORCH_HOME/cifar.python \
--arch ${arch} --dataset ${dataset} --batch_size 128 \
--save_path ${SAVED} \
--learning_rate_max 0.025 --learning_rate_min 0.001 --momentum 0.9 --weight_decay 0.0003 \
--epochs ${epoch} --cutout ${cutout} --validate --grad_clip 5 \
--init_channels 16 --layers 8 \
--model_config ./configs/nas-cifar-cos.config \
--print_freq 100 --workers 8

View File

@@ -1,45 +0,0 @@
#!/usr/bin/env sh
if [ "$#" -ne 3 ]; then
  echo "Illegal number of parameters: $#"
  echo "Need 3 parameters: the GPUs, the network, and the dataset"
  exit 1
fi
if [ "$TORCH_HOME" = "" ]; then
  echo "Must set the TORCH_HOME environment variable for the data directory"
  exit 1
else
  echo "TORCH_HOME : $TORCH_HOME"
fi
gpus=$1
arch=$2
cutout=0
dataset=$3
epoch=50
SAVED=./snapshots/NAS/Search-${arch}-${dataset}-cut${cutout}-${epoch}
if [ "$dataset" == "cifar10" ] ;then
dataset_root=$TORCH_HOME/cifar.python
print_freq=100
elif [ "$dataset" == "cifar100" ] ;then
dataset_root=$TORCH_HOME/cifar.python
print_freq=100
elif [ "$dataset" == "tiered" ] ;then
dataset_root=$TORCH_HOME/tiered-imagenet
print_freq=500
else
echo "invalid dataset name: ${dataset}"
exit 1
fi
CUDA_VISIBLE_DEVICES=${gpus} python ./exps-nas/train_search.py \
--data_path ${dataset_root} \
--arch ${arch} \
--dataset ${dataset} --batch_size 64 \
--save_path ${SAVED} \
--learning_rate_max 0.025 --learning_rate_min 0.001 --momentum 0.9 --weight_decay 0.0003 \
--epochs ${epoch} --cutout ${cutout} --validate --grad_clip 5 \
--init_channels 16 --layers 8 \
--manualSeed 3858 \
--model_config ./configs/nas-cifar-cos.config \
--print_freq ${print_freq} --workers 8

View File

@@ -1,9 +0,0 @@
#!/usr/bin/env sh
seeds="seed-8167 seed-908 seed-9242"
for seed in ${seeds}; do
python ./exps-nas/vis-arch.py --checkpoint ./snapshots/NAS/Search-cifar10-cut16-100/${seed}/checkpoint-search.pth \
--save_dir ./snapshots/NAS-VIS/Search-cut16-100/${seed}
done

View File

@@ -1,313 +0,0 @@
# DARTS First Order, Refer to https://github.com/quark0/darts
import os, sys, time, glob, random, argparse
import numpy as np
from copy import deepcopy
import torch
import torch.nn.functional as F
import torchvision.datasets as dset
import torch.backends.cudnn as cudnn
import torchvision.transforms as transforms
from pathlib import Path
lib_dir = (Path(__file__).parent / '..' / 'lib').resolve()
if str(lib_dir) not in sys.path: sys.path.insert(0, str(lib_dir))
from utils import AverageMeter, time_string, convert_secs2time
from utils import print_log, obtain_accuracy
from utils import Cutout, count_parameters_in_MB
from datasets import TieredImageNet
from nas import return_alphas_str, Network, NetworkV1, NetworkF1
from train_utils import main_procedure
from scheduler import load_config
Networks = {'base': Network, 'share': NetworkV1, 'fix': NetworkF1}
parser = argparse.ArgumentParser("CNN")
parser.add_argument('--data_path', type=str, help='Path to dataset')
parser.add_argument('--dataset', type=str, choices=['cifar10', 'cifar100', 'tiered'], help='Choose between Cifar10/100 and TieredImageNet.')
parser.add_argument('--arch', type=str, choices=Networks.keys(), help='Choose networks.')
parser.add_argument('--batch_size', type=int, help='the batch size')
parser.add_argument('--learning_rate_max', type=float, help='initial learning rate')
parser.add_argument('--learning_rate_min', type=float, help='minimum learning rate')
parser.add_argument('--momentum', type=float, help='momentum')
parser.add_argument('--weight_decay', type=float, help='weight decay')
parser.add_argument('--epochs', type=int, help='num of training epochs')
# architecture learning rate
parser.add_argument('--arch_learning_rate', type=float, default=3e-4, help='learning rate for arch encoding')
parser.add_argument('--arch_weight_decay', type=float, default=1e-3, help='weight decay for arch encoding')
#
parser.add_argument('--init_channels', type=int, help='num of init channels')
parser.add_argument('--layers', type=int, help='total number of layers')
#
parser.add_argument('--cutout', type=int, help='cutout length, negative means no cutout')
parser.add_argument('--grad_clip', type=float, help='gradient clipping')
parser.add_argument('--model_config', type=str , help='the model configuration')
# resume
parser.add_argument('--resume', type=str , help='the resume path')
parser.add_argument('--only_base',action='store_true', default=False, help='only train the searched model')
# split data
parser.add_argument('--validate', action='store_true', default=False, help='whether to split the training data into train/val sets')
parser.add_argument('--train_portion', type=float, default=0.5, help='portion of training data')
# log
parser.add_argument('--workers', type=int, default=2, help='number of data loading workers (default: 2)')
parser.add_argument('--save_path', type=str, help='Folder to save checkpoints and log.')
parser.add_argument('--print_freq', type=int, help='print frequency (default: 200)')
parser.add_argument('--manualSeed', type=int, help='manual seed')
args = parser.parse_args()
assert torch.cuda.is_available(), 'torch.cuda is not available'
if args.manualSeed is None:
args.manualSeed = random.randint(1, 10000)
random.seed(args.manualSeed)
cudnn.benchmark = True
cudnn.enabled = True
torch.manual_seed(args.manualSeed)
torch.cuda.manual_seed_all(args.manualSeed)
def main():
# Init logger
args.save_path = os.path.join(args.save_path, 'seed-{:}'.format(args.manualSeed))
if not os.path.isdir(args.save_path):
os.makedirs(args.save_path)
log = open(os.path.join(args.save_path, 'log-seed-{:}.txt'.format(args.manualSeed)), 'w')
print_log('save path : {}'.format(args.save_path), log)
state = {k: v for k, v in args._get_kwargs()}
print_log(state, log)
print_log("Random Seed: {}".format(args.manualSeed), log)
print_log("Python version : {}".format(sys.version.replace('\n', ' ')), log)
print_log("Torch version : {}".format(torch.__version__), log)
print_log("CUDA version : {}".format(torch.version.cuda), log)
print_log("cuDNN version : {}".format(cudnn.version()), log)
print_log("Num of GPUs : {}".format(torch.cuda.device_count()), log)
args.dataset = args.dataset.lower()
# Mean + Std
if args.dataset == 'cifar10':
mean = [x / 255 for x in [125.3, 123.0, 113.9]]
std = [x / 255 for x in [63.0, 62.1, 66.7]]
elif args.dataset == 'cifar100':
mean = [x / 255 for x in [129.3, 124.1, 112.4]]
std = [x / 255 for x in [68.2, 65.4, 70.4]]
elif args.dataset == 'tiered':
mean, std = [0.485, 0.456, 0.406], [0.229, 0.224, 0.225]
else:
raise TypeError("Unknow dataset : {:}".format(args.dataset))
# Data Augmentation
if args.dataset == 'cifar10' or args.dataset == 'cifar100':
lists = [transforms.RandomHorizontalFlip(), transforms.RandomCrop(32, padding=4), transforms.ToTensor(),
transforms.Normalize(mean, std)]
if args.cutout > 0 : lists += [Cutout(args.cutout)]
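# Cutout masks a randomly placed square patch (side length = args.cutout
# pixels) of the normalized image; it is skipped for non-positive lengths.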
train_transform = transforms.Compose(lists)
test_transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize(mean, std)])
elif args.dataset == 'tiered':
lists = [transforms.RandomHorizontalFlip(), transforms.RandomCrop(80, padding=4), transforms.ToTensor(), transforms.Normalize(mean, std)]
if args.cutout > 0 : lists += [Cutout(args.cutout)]
train_transform = transforms.Compose(lists)
test_transform = transforms.Compose([transforms.CenterCrop(80), transforms.ToTensor(), transforms.Normalize(mean, std)])
else:
raise TypeError("Unknow dataset : {:}".format(args.dataset))
# Datasets
if args.dataset == 'cifar10':
train_data = dset.CIFAR10(args.data_path, train= True, transform=train_transform, download=True)
test_data = dset.CIFAR10(args.data_path, train=False, transform=test_transform , download=True)
num_classes, head = 10, 'cifar'
elif args.dataset == 'cifar100':
train_data = dset.CIFAR100(args.data_path, train= True, transform=train_transform, download=True)
test_data = dset.CIFAR100(args.data_path, train=False, transform=test_transform , download=True)
num_classes, head = 100, 'cifar'
elif args.dataset == 'tiered':
train_data = TieredImageNet(args.data_path, 'train-val', train_transform)
test_data = None
num_classes, head = train_data.n_classes, 'imagenet'
else:
raise TypeError("Unknow dataset : {:}".format(args.dataset))
# Data Loader
if args.validate:
indices = list(range(len(train_data)))
split = int(args.train_portion * len(indices))
random.shuffle(indices)
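# Bilevel data split: the first `split` shuffled indices train the network
# weights, while the held-out remainder acts as the validation stream that
# drives the architecture updates.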
train_loader = torch.utils.data.DataLoader(train_data, batch_size=args.batch_size,
sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[:split]),
pin_memory=True, num_workers=args.workers)
test_loader = torch.utils.data.DataLoader(train_data, batch_size=args.batch_size,
sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[split:]),
pin_memory=True, num_workers=args.workers)
else:
train_loader = torch.utils.data.DataLoader(train_data, batch_size=args.batch_size, shuffle=True, num_workers=args.workers, pin_memory=True)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=args.batch_size, shuffle=True, num_workers=args.workers, pin_memory=True)
# network and criterion
criterion = torch.nn.CrossEntropyLoss().cuda()
basemodel = Networks[args.arch](args.init_channels, num_classes, args.layers, head=head)
model = torch.nn.DataParallel(basemodel).cuda()
print_log("Network : {:}".format(model), log)
print_log("Parameter size = {:.3f} MB".format(count_parameters_in_MB(basemodel.base_parameters())), log)
print_log("Train-transformation : {:}\nTest--transformation : {:}\nClass number : {:}".format(train_transform, test_transform, num_classes), log)
# optimizer and LR-scheduler
base_optimizer = torch.optim.SGD (basemodel.base_parameters(), args.learning_rate_max, momentum=args.momentum, weight_decay=args.weight_decay)
base_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(base_optimizer, float(args.epochs), eta_min=args.learning_rate_min)
arch_optimizer = torch.optim.Adam(basemodel.arch_parameters(), lr=args.arch_learning_rate, betas=(0.5, 0.999), weight_decay=args.arch_weight_decay)
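# Two optimizers, as in DARTS: SGD with a cosine-annealed learning rate for
# the network weights, and Adam for the architecture parameters (the alphas).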
# snapshot
checkpoint_path = os.path.join(args.save_path, 'checkpoint-search.pth')
if args.resume is not None and os.path.isfile(args.resume):
checkpoint = torch.load(args.resume)
start_epoch = checkpoint['epoch']
basemodel.load_state_dict( checkpoint['state_dict'] )
base_optimizer.load_state_dict( checkpoint['base_optimizer'] )
arch_optimizer.load_state_dict( checkpoint['arch_optimizer'] )
base_scheduler.load_state_dict( checkpoint['base_scheduler'] )
genotypes = checkpoint['genotypes']
print_log('Load resume from {:} with start-epoch = {:}'.format(args.resume, start_epoch), log)
elif os.path.isfile(checkpoint_path):
checkpoint = torch.load(checkpoint_path)
start_epoch = checkpoint['epoch']
basemodel.load_state_dict( checkpoint['state_dict'] )
base_optimizer.load_state_dict( checkpoint['base_optimizer'] )
arch_optimizer.load_state_dict( checkpoint['arch_optimizer'] )
base_scheduler.load_state_dict( checkpoint['base_scheduler'] )
genotypes = checkpoint['genotypes']
print_log('Load checkpoint from {:} with start-epoch = {:}'.format(checkpoint_path, start_epoch), log)
else:
start_epoch, genotypes = 0, {}
print_log('Train model-search from scratch.', log)
config = load_config(args.model_config)
if args.only_base:
print_log('---- Only Train the Searched Model ----', log)
main_procedure(config, args.dataset, args.data_path, args, basemodel.genotype(), 36, 20, log)
return
# Main loop
start_time, epoch_time, total_train_time = time.time(), AverageMeter(), 0
for epoch in range(start_epoch, args.epochs):
base_scheduler.step()
need_time = convert_secs2time(epoch_time.val * (args.epochs-epoch), True)
print_log('\n==>>{:s} [Epoch={:03d}/{:03d}] {:s} [LR={:6.4f} ~ {:6.4f}] [Batch={:d}]'.format(time_string(), epoch, args.epochs, need_time, min(base_scheduler.get_lr()), max(base_scheduler.get_lr()), args.batch_size), log)
genotype = basemodel.genotype()
print_log('genotype = {:}'.format(genotype), log)
print_log('{:03d}/{:03d} alphas :\n{:}'.format(epoch, args.epochs, return_alphas_str(basemodel)), log)
# training
train_acc1, train_acc5, train_obj, train_time \
= train(train_loader, test_loader, model, criterion, base_optimizer, arch_optimizer, epoch, log)
total_train_time += train_time
# validation
valid_acc1, valid_acc5, valid_obj = infer(test_loader, model, criterion, epoch, log)
print_log('Base-Search : {:03d}/{:03d} : Train-Acc={:.3f}, Test-Acc={:.3f}'.format(epoch, args.epochs, train_acc1, valid_acc1), log)
# save genotype
genotypes[epoch] = basemodel.genotype()
# save checkpoint
torch.save({'epoch' : epoch + 1,
'args' : deepcopy(args),
'state_dict': basemodel.state_dict(),
'genotypes' : genotypes,
'base_optimizer' : base_optimizer.state_dict(),
'arch_optimizer' : arch_optimizer.state_dict(),
'base_scheduler' : base_scheduler.state_dict()},
checkpoint_path)
print_log('----> Save into {:}'.format(checkpoint_path), log)
# measure elapsed time
epoch_time.update(time.time() - start_time)
start_time = time.time()
print_log('Finish with training time = {:}'.format( convert_secs2time(total_train_time, True) ), log)
# clear GPU cache
#torch.cuda.empty_cache()
#main_procedure(config, 'cifar10', os.environ['TORCH_HOME'] + '/cifar.python', args, basemodel.genotype(), 36, 20, log)
log.close()
def train(train_queue, valid_queue, model, criterion, base_optimizer, arch_optimizer, epoch, log):
data_time, batch_time = AverageMeter(), AverageMeter()
objs, top1, top5 = AverageMeter(), AverageMeter(), AverageMeter()
model.train()
valid_iter = iter(valid_queue)
end = time.time()
for step, (inputs, targets) in enumerate(train_queue):
batch, C, H, W = inputs.size()
#inputs, targets = inputs.cuda(), targets.cuda(non_blocking=True)
targets = targets.cuda(non_blocking=True)
data_time.update(time.time() - end)
# get a random minibatch from the search queue with replacement
try:
input_search, target_search = next(valid_iter)
except StopIteration:
valid_iter = iter(valid_queue)
input_search, target_search = next(valid_iter)
target_search = target_search.cuda(non_blocking=True)
# update the architecture
arch_optimizer.zero_grad()
output_search = model(input_search)
arch_loss = criterion(output_search, target_search)
arch_loss.backward()
arch_optimizer.step()
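# The update above is the first-order DARTS approximation: the alphas are
# stepped on a validation batch using gradients taken at the current weights,
# omitting the second-order term of the full bilevel objective.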
# update the parameters
base_optimizer.zero_grad()
logits = model(inputs)
loss = criterion(logits, targets)
loss.backward()
torch.nn.utils.clip_grad_norm_(model.module.base_parameters(), args.grad_clip)
base_optimizer.step()
prec1, prec5 = obtain_accuracy(logits.data, targets.data, topk=(1, 5))
objs.update(loss.item() , batch)
top1.update(prec1.item(), batch)
top5.update(prec5.item(), batch)
# measure elapsed time
batch_time.update(time.time() - end)
end = time.time()
if step % args.print_freq == 0 or (step+1) == len(train_queue):
Sstr = ' TRAIN-SEARCH ' + time_string() + ' Epoch: [{:03d}][{:03d}/{:03d}]'.format(epoch, step, len(train_queue))
Tstr = 'Time {batch_time.val:.2f} ({batch_time.avg:.2f}) Data {data_time.val:.2f} ({data_time.avg:.2f})'.format(batch_time=batch_time, data_time=data_time)
Lstr = 'Loss {loss.val:.3f} ({loss.avg:.3f}) Prec@1 {top1.val:.2f} ({top1.avg:.2f}) Prec@5 {top5.val:.2f} ({top5.avg:.2f})'.format(loss=objs, top1=top1, top5=top5)
print_log(Sstr + ' ' + Tstr + ' ' + Lstr, log)
return top1.avg, top5.avg, objs.avg, batch_time.sum
def infer(valid_queue, model, criterion, epoch, log):
objs, top1, top5 = AverageMeter(), AverageMeter(), AverageMeter()
model.eval()
with torch.no_grad():
for step, (inputs, targets) in enumerate(valid_queue):
batch, C, H, W = inputs.size()
targets = targets.cuda(non_blocking=True)
logits = model(inputs)
loss = criterion(logits, targets)
prec1, prec5 = obtain_accuracy(logits.data, targets.data, topk=(1, 5))
objs.update(loss.item() , batch)
top1.update(prec1.item(), batch)
top5.update(prec5.item(), batch)
if step % args.print_freq == 0 or (step+1) == len(valid_queue):
Sstr = ' VALID-SEARCH ' + time_string() + ' Epoch: [{:03d}][{:03d}/{:03d}]'.format(epoch, step, len(valid_queue))
Lstr = 'Loss {loss.val:.3f} ({loss.avg:.3f}) Prec@1 {top1.val:.2f} ({top1.avg:.2f}) Prec@5 {top5.val:.2f} ({top5.avg:.2f})'.format(loss=objs, top1=top1, top5=top5)
print_log(Sstr + ' ' + Lstr, log)
return top1.avg, top5.avg, objs.avg
if __name__ == '__main__':
main()

View File

@@ -1,310 +0,0 @@
import os, sys, time, glob, random, argparse
import numpy as np
from copy import deepcopy
import torch
import torch.nn.functional as F
import torchvision.datasets as dset
import torch.backends.cudnn as cudnn
import torchvision.transforms as transforms
from pathlib import Path
lib_dir = (Path(__file__).parent / '..' / 'lib').resolve()
if str(lib_dir) not in sys.path: sys.path.insert(0, str(lib_dir))
from utils import AverageMeter, time_string, convert_secs2time
from utils import print_log, obtain_accuracy
from utils import Cutout, count_parameters_in_MB
from nas import Network, NetworkACC2, NetworkV3, NetworkV4, NetworkV5, NetworkFACC1
from nas import return_alphas_str
from train_utils import main_procedure
from scheduler import load_config
Networks = {'base': Network, 'acc2': NetworkACC2, 'facc1': NetworkFACC1, 'NetworkV3': NetworkV3, 'NetworkV4': NetworkV4, 'NetworkV5': NetworkV5}
parser = argparse.ArgumentParser("cifar")
parser.add_argument('--data_path', type=str, help='Path to dataset')
parser.add_argument('--dataset', type=str, choices=['cifar10', 'cifar100'], help='Choose between CIFAR-10 and CIFAR-100.')
parser.add_argument('--arch', type=str, choices=Networks.keys(), help='Choose networks.')
parser.add_argument('--batch_size', type=int, help='the batch size')
parser.add_argument('--learning_rate_max', type=float, help='initial learning rate')
parser.add_argument('--learning_rate_min', type=float, help='minimum learning rate')
parser.add_argument('--tau_max', type=float, help='initial tau')
parser.add_argument('--tau_min', type=float, help='minimum tau')
parser.add_argument('--momentum', type=float, help='momentum')
parser.add_argument('--weight_decay', type=float, help='weight decay')
parser.add_argument('--epochs', type=int, help='num of training epochs')
# architecture learning rate
parser.add_argument('--arch_learning_rate', type=float, default=3e-4, help='learning rate for arch encoding')
parser.add_argument('--arch_weight_decay', type=float, default=1e-3, help='weight decay for arch encoding')
#
parser.add_argument('--init_channels', type=int, help='num of init channels')
parser.add_argument('--layers', type=int, help='total number of layers')
#
parser.add_argument('--cutout', type=int, help='cutout length, negative means no cutout')
parser.add_argument('--grad_clip', type=float, help='gradient clipping')
parser.add_argument('--model_config', type=str , help='the model configuration')
# resume
parser.add_argument('--resume', type=str , help='the resume path')
parser.add_argument('--only_base',action='store_true', default=False, help='only train the searched model')
# split data
parser.add_argument('--validate', action='store_true', default=False, help='whether to split the training data into train/val sets')
parser.add_argument('--train_portion', type=float, default=0.5, help='portion of training data')
# log
parser.add_argument('--workers', type=int, default=2, help='number of data loading workers (default: 2)')
parser.add_argument('--save_path', type=str, help='Folder to save checkpoints and log.')
parser.add_argument('--print_freq', type=int, help='print frequency (default: 200)')
parser.add_argument('--manualSeed', type=int, help='manual seed')
args = parser.parse_args()
assert torch.cuda.is_available(), 'torch.cuda is not available'
if args.manualSeed is None:
args.manualSeed = random.randint(1, 10000)
random.seed(args.manualSeed)
cudnn.benchmark = True
cudnn.enabled = True
torch.manual_seed(args.manualSeed)
torch.cuda.manual_seed_all(args.manualSeed)
def main():
# Init logger
args.save_path = os.path.join(args.save_path, 'seed-{:}'.format(args.manualSeed))
if not os.path.isdir(args.save_path):
os.makedirs(args.save_path)
log = open(os.path.join(args.save_path, 'log-seed-{:}.txt'.format(args.manualSeed)), 'w')
print_log('save path : {}'.format(args.save_path), log)
state = {k: v for k, v in args._get_kwargs()}
print_log(state, log)
print_log("Random Seed: {}".format(args.manualSeed), log)
print_log("Python version : {}".format(sys.version.replace('\n', ' ')), log)
print_log("Torch version : {}".format(torch.__version__), log)
print_log("CUDA version : {}".format(torch.version.cuda), log)
print_log("cuDNN version : {}".format(cudnn.version()), log)
print_log("Num of GPUs : {}".format(torch.cuda.device_count()), log)
args.dataset = args.dataset.lower()
# Mean + Std
if args.dataset == 'cifar10':
mean = [x / 255 for x in [125.3, 123.0, 113.9]]
std = [x / 255 for x in [63.0, 62.1, 66.7]]
elif args.dataset == 'cifar100':
mean = [x / 255 for x in [129.3, 124.1, 112.4]]
std = [x / 255 for x in [68.2, 65.4, 70.4]]
else:
raise TypeError("Unknow dataset : {:}".format(args.dataset))
# Data Augmentation
if args.dataset == 'cifar10' or args.dataset == 'cifar100':
lists = [transforms.RandomHorizontalFlip(), transforms.RandomCrop(32, padding=4), transforms.ToTensor(),
transforms.Normalize(mean, std)]
if args.cutout > 0 : lists += [Cutout(args.cutout)]
train_transform = transforms.Compose(lists)
test_transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize(mean, std)])
else:
raise TypeError("Unknow dataset : {:}".format(args.dataset))
# Datasets
if args.dataset == 'cifar10':
train_data = dset.CIFAR10(args.data_path, train= True, transform=train_transform, download=True)
test_data = dset.CIFAR10(args.data_path, train=False, transform=test_transform , download=True)
num_classes = 10
elif args.dataset == 'cifar100':
train_data = dset.CIFAR100(args.data_path, train= True, transform=train_transform, download=True)
test_data = dset.CIFAR100(args.data_path, train=False, transform=test_transform , download=True)
num_classes = 100
else:
raise TypeError("Unknow dataset : {:}".format(args.dataset))
# Data Loader
if args.validate:
indices = list(range(len(train_data)))
split = int(args.train_portion * len(indices))
random.shuffle(indices)
train_loader = torch.utils.data.DataLoader(train_data, batch_size=args.batch_size,
sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[:split]),
pin_memory=True, num_workers=args.workers)
test_loader = torch.utils.data.DataLoader(train_data, batch_size=args.batch_size,
sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[split:]),
pin_memory=True, num_workers=args.workers)
else:
train_loader = torch.utils.data.DataLoader(train_data, batch_size=args.batch_size, shuffle=True, num_workers=args.workers, pin_memory=True)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=args.batch_size, shuffle=True, num_workers=args.workers, pin_memory=True)
# network and criterion
criterion = torch.nn.CrossEntropyLoss().cuda()
basemodel = Networks[args.arch](args.init_channels, num_classes, args.layers)
model = torch.nn.DataParallel(basemodel).cuda()
print_log("Parameter size = {:.3f} MB".format(count_parameters_in_MB(basemodel.base_parameters())), log)
print_log("Train-transformation : {:}\nTest--transformation : {:}".format(train_transform, test_transform), log)
# optimizer and LR-scheduler
base_optimizer = torch.optim.SGD (basemodel.base_parameters(), args.learning_rate_max, momentum=args.momentum, weight_decay=args.weight_decay)
#base_optimizer = torch.optim.Adam(basemodel.base_parameters(), lr=args.learning_rate_max, betas=(0.5, 0.999), weight_decay=args.weight_decay)
base_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(base_optimizer, float(args.epochs), eta_min=args.learning_rate_min)
arch_optimizer = torch.optim.Adam(basemodel.arch_parameters(), lr=args.arch_learning_rate, betas=(0.5, 0.999), weight_decay=args.arch_weight_decay)
# snapshot
checkpoint_path = os.path.join(args.save_path, 'checkpoint-search.pth')
if args.resume is not None and os.path.isfile(args.resume):
checkpoint = torch.load(args.resume)
start_epoch = checkpoint['epoch']
basemodel.load_state_dict( checkpoint['state_dict'] )
base_optimizer.load_state_dict( checkpoint['base_optimizer'] )
arch_optimizer.load_state_dict( checkpoint['arch_optimizer'] )
base_scheduler.load_state_dict( checkpoint['base_scheduler'] )
genotypes = checkpoint['genotypes']
print_log('Load resume from {:} with start-epoch = {:}'.format(args.resume, start_epoch), log)
elif os.path.isfile(checkpoint_path):
checkpoint = torch.load(checkpoint_path)
start_epoch = checkpoint['epoch']
basemodel.load_state_dict( checkpoint['state_dict'] )
base_optimizer.load_state_dict( checkpoint['base_optimizer'] )
arch_optimizer.load_state_dict( checkpoint['arch_optimizer'] )
base_scheduler.load_state_dict( checkpoint['base_scheduler'] )
genotypes = checkpoint['genotypes']
print_log('Load checkpoint from {:} with start-epoch = {:}'.format(checkpoint_path, start_epoch), log)
else:
start_epoch, genotypes = 0, {}
print_log('Train model-search from scratch.', log)
config = load_config(args.model_config)
if args.only_base:
print_log('---- Only Train the Searched Model ----', log)
main_procedure(config, args.dataset, args.data_path, args, basemodel.genotype(), 36, 20, log)
return
# Main loop
start_time, epoch_time, total_train_time = time.time(), AverageMeter(), 0
for epoch in range(start_epoch, args.epochs):
base_scheduler.step()
basemodel.set_tau( args.tau_max - epoch*1.0/args.epochs*(args.tau_max-args.tau_min) )
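# Anneal the Gumbel-Softmax temperature linearly from tau_max down to
# tau_min; lower tau pushes the sampled architecture weights closer to one-hot.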
#if epoch + 2 == args.epochs:
# torch.cuda.empty_cache()
# basemodel.set_gumbel(False)
need_time = convert_secs2time(epoch_time.val * (args.epochs-epoch), True)
print_log('\n==>>{:s} [Epoch={:03d}/{:03d}] {:s} [LR={:6.4f} ~ {:6.4f}] [Batch={:d}], tau={:}'.format(time_string(), epoch, args.epochs, need_time, min(base_scheduler.get_lr()), max(base_scheduler.get_lr()), args.batch_size, basemodel.get_tau()), log)
genotype = basemodel.genotype()
print_log('genotype = {:}'.format(genotype), log)
print_log('{:03d}/{:03d} alphas :\n{:}'.format(epoch, args.epochs, return_alphas_str(basemodel)), log)
# training
train_acc1, train_acc5, train_obj, train_time \
= train(train_loader, test_loader, model, criterion, base_optimizer, arch_optimizer, epoch, log)
total_train_time += train_time
# validation
valid_acc1, valid_acc5, valid_obj = infer(test_loader, model, criterion, epoch, log)
print_log('{:03d}/{:03d}, Train-Accuracy = {:.2f}, Test-Accuracy = {:.2f}'.format(epoch, args.epochs, train_acc1, valid_acc1), log)
# save genotype
genotypes[epoch] = basemodel.genotype()
# save checkpoint
torch.save({'epoch' : epoch + 1,
'args' : deepcopy(args),
'state_dict': basemodel.state_dict(),
'genotypes' : genotypes,
'base_optimizer' : base_optimizer.state_dict(),
'arch_optimizer' : arch_optimizer.state_dict(),
'base_scheduler' : base_scheduler.state_dict()},
checkpoint_path)
print_log('----> Save into {:}'.format(checkpoint_path), log)
# measure elapsed time
epoch_time.update(time.time() - start_time)
start_time = time.time()
print_log('Finish with training time = {:}'.format( convert_secs2time(total_train_time, True) ), log)
# clear GPU cache
#torch.cuda.empty_cache()
#main_procedure(config, args.dataset, args.data_path, args, basemodel.genotype(), 36, 20, log)
log.close()
def train(train_queue, valid_queue, model, criterion, base_optimizer, arch_optimizer, epoch, log):
data_time, batch_time = AverageMeter(), AverageMeter()
objs, top1, top5 = AverageMeter(), AverageMeter(), AverageMeter()
model.train()
valid_iter = iter(valid_queue)
end = time.time()
for step, (inputs, targets) in enumerate(train_queue):
batch, C, H, W = inputs.size()
#inputs, targets = inputs.cuda(), targets.cuda(non_blocking=True)
targets = targets.cuda(non_blocking=True)
# get a random minibatch from the search queue with replacement
try:
input_search, target_search = next(valid_iter)
except StopIteration:
valid_iter = iter(valid_queue)
input_search, target_search = next(valid_iter)
target_search = target_search.cuda(non_blocking=True)
data_time.update(time.time() - end)
# update the architecture
arch_optimizer.zero_grad()
output_search = model(input_search)
arch_loss = criterion(output_search, target_search)
arch_loss.backward()
arch_optimizer.step()
# update the parameters
base_optimizer.zero_grad()
logits = model(inputs)
loss = criterion(logits, targets)
loss.backward()
torch.nn.utils.clip_grad_norm_(model.module.base_parameters(), args.grad_clip)
base_optimizer.step()
prec1, prec5 = obtain_accuracy(logits.data, targets.data, topk=(1, 5))
objs.update(loss.item() , batch)
top1.update(prec1.item(), batch)
top5.update(prec5.item(), batch)
# measure elapsed time
batch_time.update(time.time() - end)
end = time.time()
if step % args.print_freq == 0 or (step+1) == len(train_queue):
Sstr = ' TRAIN-SEARCH ' + time_string() + ' Epoch: [{:03d}][{:03d}/{:03d}]'.format(epoch, step, len(train_queue))
Tstr = 'Time {batch_time.val:.2f} ({batch_time.avg:.2f}) Data {data_time.val:.2f} ({data_time.avg:.2f})'.format(batch_time=batch_time, data_time=data_time)
Lstr = 'Loss {loss.val:.3f} ({loss.avg:.3f}) Prec@1 {top1.val:.2f} ({top1.avg:.2f}) Prec@5 {top5.val:.2f} ({top5.avg:.2f})'.format(loss=objs, top1=top1, top5=top5)
print_log(Sstr + ' ' + Tstr + ' ' + Lstr, log)
return top1.avg, top5.avg, objs.avg, batch_time.sum
def infer(valid_queue, model, criterion, epoch, log):
objs, top1, top5 = AverageMeter(), AverageMeter(), AverageMeter()
model.eval()
with torch.no_grad():
for step, (inputs, targets) in enumerate(valid_queue):
batch, C, H, W = inputs.size()
targets = targets.cuda(non_blocking=True)
logits = model(inputs)
loss = criterion(logits, targets)
prec1, prec5 = obtain_accuracy(logits.data, targets.data, topk=(1, 5))
objs.update(loss.item() , batch)
top1.update(prec1.item(), batch)
top5.update(prec5.item(), batch)
if step % args.print_freq == 0 or (step+1) == len(valid_queue):
Sstr = ' VALID-SEARCH ' + time_string() + ' Epoch: [{:03d}][{:03d}/{:03d}]'.format(epoch, step, len(valid_queue))
Lstr = 'Loss {loss.val:.3f} ({loss.avg:.3f}) Prec@1 {top1.val:.2f} ({top1.avg:.2f}) Prec@5 {top5.val:.2f} ({top5.avg:.2f})'.format(loss=objs, top1=top1, top5=top5)
print_log(Sstr + ' ' + Lstr, log)
return top1.avg, top5.avg, objs.avg
if __name__ == '__main__':
main()
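
The `tau` threaded through this script controls a Gumbel-Softmax relaxation over the candidate operations. As a small self-contained illustration of the effect of the temperature (this uses `torch.nn.functional.gumbel_softmax` for brevity, which is not necessarily what the repo's `Network` classes call internally):

```python
import torch
import torch.nn.functional as F

alphas = torch.randn(8, requires_grad=True)  # hypothetical logits over 8 candidate ops
for tau in (10.0, 4.0, 1.0):
    # Lower temperatures concentrate the sampled weights on a single
    # operation while keeping the sample differentiable w.r.t. alphas.
    weights = F.gumbel_softmax(alphas, tau=tau, hard=False)
    print(tau, weights.max().item())
```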

View File

@@ -1,7 +1,6 @@
 import os, sys, time
 from copy import deepcopy
 import torch
-import torchvision.datasets as dset
 import torchvision.transforms as transforms

View File

@@ -2,7 +2,6 @@ import os, sys, time
 from copy import deepcopy
 import torch
 import torch.nn as nn
-import torchvision.datasets as dset
 import torchvision.transforms as transforms

output/.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
*

View File

@@ -0,0 +1,29 @@
#!/bin/bash
#
echo "CHECK-DATA-DIR START"
cifar_dir="./data/data/cifar.python"
if [ -d ${cifar_dir} ]; then
  echo "Found cifar-dir: "${cifar_dir}
else
  echo "Cannot find cifar-dir: "${cifar_dir}
  exit 1
fi
echo "CHECK-DATA-DIR DONE"
sh /home/HGCP_Program/software-install/afs_mount/bin/afs_mount.sh \
COMM_KM_Data COMM_km_2018 \
`pwd`/hadoop-data \
afs://xingtian.afs.baidu.com:9902/user/COMM_KM_Data/dongxuanyi/datasets
echo "PWD: " `pwd`
echo "files:: " `ls`
echo "CUDA_VISIBLE_DEVICES: " ${CUDA_VISIBLE_DEVICES}
# config python
PYTHON_ENV=py36_pytorch1.0_env0.1.3.tar.gz
wget -e "http_proxy=cp01-sys-hic-gpu-02.cp01:8888" http://cp01-sys-hic-gpu-02.cp01/HGCP_DEMO/$PYTHON_ENV > screen.log 2>&1
tar xzf $PYTHON_ENV
alias python="./env/bin/python"
echo "Python: " `which python`

scripts-cluster/submit.sh Normal file
View File

@@ -0,0 +1,44 @@
#!/bin/bash
# bash ./scripts-cluster/submit.sh ${QUEUE} ${JOB-NAME} ${GPUs} ${CMD}
#find -name "._*" | xargs rm -rf
ODIR=$(pwd)
FDIR=$(cd $(dirname $0); pwd)
echo "Bash-Dir : "${ODIR}
echo "File-Dir : "${FDIR}
echo "File-Name: "${0}
if [ "$#" -ne 4 ] ;then
echo "Input illegal number of parameters " $#
echo "Need 4 parameters for the queue-name, the job-name, and the number-of-GPUs"
exit 1
fi
find -name "__pycache__" | xargs rm -rf
QUEUE=$1
NAME=$2
GPUs=$3
CMD=$4
TIME=$(date +"%Y-%h-%d-%T")
JOB_SCRIPT="${FDIR}/tmps/job-${TIME}.sh"
cat ${FDIR}/job-script.sh > ${JOB_SCRIPT}
echo ${CMD} >> ${JOB_SCRIPT}
exit 1 # NOTE: this early exit prevents the submission below from running; remove it to actually submit
HGCP_CLIENT_BIN="${HOME}/.hgcp/software-install/HGCP_client/bin"
${HGCP_CLIENT_BIN}/submit \
--hdfs afs://xingtian.afs.baidu.com:9902 \
--hdfs-user COMM_KM_Data \
--hdfs-passwd COMM_km_2018 \
--hdfs-path /user/COMM_KM_Data/dongxuanyi/logs \
--file-dir ./ \
--job-name ${NAME} \
--queue-name ${QUEUE} \
--num-nodes 1 \
--num-task-pernode 1 \
--gpu-pnode ${GPUs} \
--time-limit 0 \
--job-script ${JOB_SCRIPT}

scripts-cluster/tmps/.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
*

View File

@@ -1,31 +0,0 @@
#!/usr/bin/env sh
# bash scripts-cnn/DMS-V-TrainV3.sh 1
if [ "$#" -ne 1 ]; then
  echo "Illegal number of parameters: $#"
  echo "Need 1 parameter: the GPUs"
  exit 1
fi
if [ "$TORCH_HOME" = "" ]; then
  echo "Must set the TORCH_HOME environment variable for the data directory"
  exit 1
else
  echo "TORCH_HOME : $TORCH_HOME"
fi
gpus=$1
arch=acc2
cutout=0
dataset=cifar10
epoch=200
SAVED=./snapshots/NAS/ACC-V3-Search-${arch}-${dataset}-cut${cutout}-${epoch}-E600
CUDA_VISIBLE_DEVICES=${gpus} python ./exps-cnn/GDAS-Search.py \
--data_path $TORCH_HOME/cifar.python \
--arch ${arch} --dataset ${dataset} --batch_size 128 \
--save_path ${SAVED} \
--learning_rate_max 0.01 --learning_rate_min 0.001 --momentum 0.9 --weight_decay 0.0003 \
--epochs ${epoch} --cutout ${cutout} --validate --grad_clip 5 \
--init_channels 16 --layers 8 \
--tau_max 10 --tau_min 1 \
--model_config ./configs/nas-cifar-cos-cut.config \
--print_freq 100 --workers 10

View File

@@ -1,29 +0,0 @@
#!/usr/bin/env sh
if [ "$#" -ne 2 ]; then
  echo "Illegal number of parameters: $#"
  echo "Need 2 parameters: the GPUs and the network"
  exit 1
fi
if [ "$TORCH_HOME" = "" ]; then
  echo "Must set the TORCH_HOME environment variable for the data directory"
  exit 1
else
  echo "TORCH_HOME : $TORCH_HOME"
fi
gpus=$1
arch=$2
cutout=0
dataset=cifar10
epoch=200
SAVED=./snapshots/NAS/ACC-V2-Search-${arch}-${dataset}-cut${cutout}-${epoch}-E600
CUDA_VISIBLE_DEVICES=${gpus} python ./exps-cnn/GDAS-Search.py \
--data_path $TORCH_HOME/cifar.python \
--arch ${arch} --dataset ${dataset} --batch_size 128 \
--save_path ${SAVED} \
--learning_rate_max 0.025 --learning_rate_min 0.001 --momentum 0.9 --weight_decay 0.0003 \
--epochs ${epoch} --cutout ${cutout} --validate --grad_clip 5 \
--init_channels 16 --layers 8 \
--model_config ./configs/nas-cifar-cos-cut.config \
--print_freq 100 --workers 10

View File

@@ -1,44 +0,0 @@
#!/usr/bin/env sh
if [ "$#" -ne 3 ]; then
  echo "Illegal number of parameters: $#"
  echo "Need 3 parameters: the GPUs, the network, and the dataset"
  exit 1
fi
if [ "$TORCH_HOME" = "" ]; then
  echo "Must set the TORCH_HOME environment variable for the data directory"
  exit 1
else
  echo "TORCH_HOME : $TORCH_HOME"
fi
gpus=$1
arch=$2
cutout=0
dataset=$3
epoch=50
SAVED=./snapshots/NAS/Search-${arch}-${dataset}-cut${cutout}-${epoch}
if [ "$dataset" == "cifar10" ] ;then
dataset_root=$TORCH_HOME/cifar.python
print_freq=100
elif [ "$dataset" == "cifar100" ] ;then
dataset_root=$TORCH_HOME/cifar.python
print_freq=100
elif [ "$dataset" == "tiered" ] ;then
dataset_root=$TORCH_HOME/tiered-imagenet
print_freq=500
else
echo "invalid dataset name: ${dataset}"
exit 1
fi
CUDA_VISIBLE_DEVICES=${gpus} python ./exps-cnn/DARTS-Search.py \
--data_path ${dataset_root} \
--arch ${arch} \
--dataset ${dataset} --batch_size 64 \
--save_path ${SAVED} \
--learning_rate_max 0.025 --learning_rate_min 0.001 --momentum 0.9 --weight_decay 0.0003 \
--epochs ${epoch} --cutout ${cutout} --validate --grad_clip 5 \
--init_channels 16 --layers 8 \
--model_config ./configs/nas-cifar-cos-cut.config \
--print_freq ${print_freq} --workers 8

View File

@@ -17,9 +17,10 @@ arch=$2
 dataset=$3
 cutout=$4
 SAVED=./snapshots/NAS/${arch}-${dataset}-${cutout}-E600
+#--data_path $TORCH_HOME/cifar.python \
 CUDA_VISIBLE_DEVICES=${gpus} python ./exps-cnn/train_base.py \
-  --data_path $TORCH_HOME/cifar.python \
+  --data_path ./data/data/cifar.python \
   --dataset ${dataset} --arch ${arch} \
   --save_path ${SAVED} \
   --grad_clip 5 \

View File

@@ -1,21 +1,14 @@
 #!/usr/bin/env sh
-if [ "$#" -ne 2 ] ;then
+if [ "$#" -ne 1 ] ;then
 echo "Input illegal number of parameters " $#
-echo "Need 2 parameters for the GPU and the architecture"
+echo "Need 1 parameters for the GPU and the architecture"
 exit 1
 fi
-if [ "$TORCH_HOME" = "" ]; then
-echo "Must set TORCH_HOME envoriment variable for data dir saving"
-exit 1
-else
-echo "TORCH_HOME : $TORCH_HOME"
-fi
-gpus=$1
-arch=$2
-SAVED=./snapshots/NAS-RNN/Search-${arch}-PTB
-CUDA_VISIBLE_DEVICES=${gpus} python ./exps-rnn/train_rnn_base.py \
+arch=$1
+SAVED=./output/NAS-RNN/Search-${arch}-PTB
+python ./exps-rnn/train_rnn_base.py \
 --arch ${arch} \
 --save_path ${SAVED} \
 --config_path ./configs/NAS-PTB-BASE.config \

View File

@@ -1,21 +1,14 @@
 #!/usr/bin/env sh
-if [ "$#" -ne 2 ] ;then
+if [ "$#" -ne 1 ] ;then
 echo "Input illegal number of parameters " $#
-echo "Need 2 parameters for the GPU and the architecture"
+echo "Need 1 parameters for the architectures"
 exit 1
 fi
-if [ "$TORCH_HOME" = "" ]; then
-echo "Must set TORCH_HOME envoriment variable for data dir saving"
-exit 1
-else
-echo "TORCH_HOME : $TORCH_HOME"
-fi
-gpus=$1
-arch=$2
-SAVED=./snapshots/NAS-RNN/Search-${arch}-WT2
-CUDA_VISIBLE_DEVICES=${gpus} python ./exps-nas/rnn/train_rnn_base.py \
+arch=$1
+SAVED=./output/NAS-RNN/Search-${arch}-WT2
+python ./exps-rnn/train_rnn_base.py \
 --arch ${arch} \
 --save_path ${SAVED} \
 --config_path ./configs/NAS-WT2-BASE.config \