update ImageNet training
This commit is contained in:
		| @@ -24,8 +24,8 @@ CUDA_VISIBLE_DEVICES=0 bash ./scripts-cnn/train-cifar.sh GDAS_V1 cifar100 cut | |||||||
|  |  | ||||||
| Train the searched CNN on ImageNet | Train the searched CNN on ImageNet | ||||||
| ``` | ``` | ||||||
| CUDA_VISIBLE_DEVICES=0 bash ./scripts-cnn/train-imagenet.sh GDAS_F1 52 14 | CUDA_VISIBLE_DEVICES=0 bash ./scripts-cnn/train-imagenet.sh GDAS_F1 52 14 B128 -1 | ||||||
| CUDA_VISIBLE_DEVICES=0 bash ./scripts-cnn/train-imagenet.sh GDAS_V1 50 14 | CUDA_VISIBLE_DEVICES=0 bash ./scripts-cnn/train-imagenet.sh GDAS_V1 50 14 B128 -1 | ||||||
| ``` | ``` | ||||||
|  |  | ||||||
| Evaluate a trained CNN model | Evaluate a trained CNN model | ||||||
|   | |||||||
							
								
								
									
										15
									
								
								configs/nas-imagenet-B128.config
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										15
									
								
								configs/nas-imagenet-B128.config
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,15 @@ | |||||||
|  | { | ||||||
|  |   "type"      : ["str",   "steplr"], | ||||||
|  |   "batch_size": ["int",   128], | ||||||
|  |   "epochs"    : ["int",   250], | ||||||
|  |   "decay_period": ["int",   1], | ||||||
|  |   "gamma"     : ["float", 0.97], | ||||||
|  |   "momentum"  : ["float", 0.9], | ||||||
|  |   "decay"     : ["float", 0.00003], | ||||||
|  |   "LR"        : ["float", 0.1], | ||||||
|  |   "label_smooth": ["float", 0.1], | ||||||
|  |   "auxiliary" : ["bool", 1], | ||||||
|  |   "auxiliary_weight" : ["float", 0.4], | ||||||
|  |   "grad_clip" : ["float", 5], | ||||||
|  |   "drop_path_prob" : ["float", 0] | ||||||
|  | } | ||||||
							
								
								
									
										15
									
								
								configs/nas-imagenet-B256.config
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										15
									
								
								configs/nas-imagenet-B256.config
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,15 @@ | |||||||
|  | { | ||||||
|  |   "type"      : ["str",   "steplr"], | ||||||
|  |   "batch_size": ["int",   256], | ||||||
|  |   "epochs"    : ["int",   250], | ||||||
|  |   "decay_period": ["int",   1], | ||||||
|  |   "gamma"     : ["float", 0.97], | ||||||
|  |   "momentum"  : ["float", 0.9], | ||||||
|  |   "decay"     : ["float", 0.00003], | ||||||
|  |   "LR"        : ["float", 0.1], | ||||||
|  |   "label_smooth": ["float", 0.1], | ||||||
|  |   "auxiliary" : ["bool", 1], | ||||||
|  |   "auxiliary_weight" : ["float", 0.4], | ||||||
|  |   "grad_clip" : ["float", 5], | ||||||
|  |   "drop_path_prob" : ["float", 0] | ||||||
|  | } | ||||||
| @@ -42,7 +42,7 @@ else                                       : print('Find CUDA_VISIBLE_DEVICES={: | |||||||
| assert torch.cuda.is_available(), 'torch.cuda is not available' | assert torch.cuda.is_available(), 'torch.cuda is not available' | ||||||
|  |  | ||||||
|  |  | ||||||
| if args.manualSeed is None: | if args.manualSeed is None or args.manualSeed < 0: | ||||||
|   args.manualSeed = random.randint(1, 10000) |   args.manualSeed = random.randint(1, 10000) | ||||||
| random.seed(args.manualSeed) | random.seed(args.manualSeed) | ||||||
| cudnn.benchmark = True | cudnn.benchmark = True | ||||||
| @@ -54,10 +54,10 @@ torch.cuda.manual_seed_all(args.manualSeed) | |||||||
| def main(): | def main(): | ||||||
|  |  | ||||||
|   # Init logger |   # Init logger | ||||||
|   args.save_path = os.path.join(args.save_path, 'seed-{:}'.format(args.manualSeed)) |   #args.save_path = os.path.join(args.save_path, 'seed-{:}'.format(args.manualSeed)) | ||||||
|   if not os.path.isdir(args.save_path): |   if not os.path.isdir(args.save_path): | ||||||
|     os.makedirs(args.save_path) |     os.makedirs(args.save_path) | ||||||
|   log = open(os.path.join(args.save_path, 'log-seed-{:}.txt'.format(args.manualSeed)), 'w') |   log = open(os.path.join(args.save_path, 'seed-{:}-log.txt'.format(args.manualSeed)), 'w') | ||||||
|   print_log('Save Path      : {:}'.format(args.save_path), log) |   print_log('Save Path      : {:}'.format(args.save_path), log) | ||||||
|   state = {k: v for k, v in args._get_kwargs()} |   state = {k: v for k, v in args._get_kwargs()} | ||||||
|   print_log(state, log) |   print_log(state, log) | ||||||
|   | |||||||
| @@ -59,8 +59,8 @@ def main_procedure(config, dataset, data_path, args, genotype, init_channels, la | |||||||
|     raise ValueError('Can not find the schedular type : {:}'.format(config.type)) |     raise ValueError('Can not find the schedular type : {:}'.format(config.type)) | ||||||
|  |  | ||||||
|  |  | ||||||
|   checkpoint_path = os.path.join(args.save_path, 'checkpoint-{:}-model.pth'.format(dataset)) |   checkpoint_path = os.path.join(args.save_path, 'seed-{:}-checkpoint-{:}-model.pth'.format(args.manualSeed, dataset)) | ||||||
|   checkpoint_best = os.path.join(args.save_path, 'checkpoint-{:}-best.pth'.format(dataset)) |   checkpoint_best = os.path.join(args.save_path, 'seed-{:}-checkpoint-{:}-best.pth'.format(args.manualSeed, dataset)) | ||||||
|   if pure_evaluate: |   if pure_evaluate: | ||||||
|     print_log('-'*20 + 'Pure Evaluation' + '-'*20, log) |     print_log('-'*20 + 'Pure Evaluation' + '-'*20, log) | ||||||
|     basemodel.load_state_dict( pure_evaluate ) |     basemodel.load_state_dict( pure_evaluate ) | ||||||
|   | |||||||
| @@ -81,8 +81,8 @@ def main_procedure_imagenet(config, data_path, args, genotype, init_channels, la | |||||||
|     raise ValueError('Can not find the schedular type : {:}'.format(config.type)) |     raise ValueError('Can not find the schedular type : {:}'.format(config.type)) | ||||||
|  |  | ||||||
|  |  | ||||||
|   checkpoint_path = os.path.join(args.save_path, 'checkpoint-imagenet-model.pth') |   checkpoint_path = os.path.join(args.save_path, 'seed-{:}-checkpoint-imagenet-model.pth'.format(args.manualSeed)) | ||||||
|   checkpoint_best = os.path.join(args.save_path, 'checkpoint-imagenet-best.pth') |   checkpoint_best = os.path.join(args.save_path, 'seed-{:}-checkpoint-imagenet-best.pth'.format(args.manualSeed)) | ||||||
|  |  | ||||||
|   if pure_evaluate: |   if pure_evaluate: | ||||||
|     print_log('-'*20 + 'Pure Evaluation' + '-'*20, log) |     print_log('-'*20 + 'Pure Evaluation' + '-'*20, log) | ||||||
|   | |||||||
| @@ -1,15 +1,16 @@ | |||||||
| #!/bin/bash | #!/bin/bash | ||||||
| # | # | ||||||
| echo "CHECK-DATA-DIR START" | echo "CHECK-DATA-DIR START" | ||||||
| #sh /home/HGCP_Program/software-install/afs_mount/bin/afs_mount.sh \ | sh /home/HGCP_Program/software-install/afs_mount/bin/afs_mount.sh \ | ||||||
| #    COMM_KM_Data COMM_km_2018 \ |     COMM_KM_Data COMM_km_2018 \ | ||||||
| #    `pwd`/hadoop-data \ |     `pwd`/hadoop-data \ | ||||||
| #    afs://xingtian.afs.baidu.com:9902/user/COMM_KM_Data/dongxuanyi/datasets |     afs://xingtian.afs.baidu.com:9902/user/COMM_KM_Data/dongxuanyi/datasets | ||||||
|  |  | ||||||
| export TORCH_HOME="./data/data/" | export TORCH_HOME="./data/data/" | ||||||
| wget -q http://10.127.2.44:8000/cifar.python.tar --directory-prefix=${TORCH_HOME} | #wget -q http://10.127.2.44:8000/cifar.python.tar --directory-prefix=${TORCH_HOME} | ||||||
| tar xvf ${TORCH_HOME}/cifar.python.tar -C ${TORCH_HOME} | #tar -xvf ${TORCH_HOME}/cifar.python.tar -C ${TORCH_HOME} | ||||||
| rm ${TORCH_HOME}/cifar.python.tar | tar -xf ./hadoop-data/cifar.python.tar -C ${TORCH_HOME} | ||||||
|  | #rm ${TORCH_HOME}/cifar.python.tar | ||||||
| #tar xvf ./hadoop-data/ILSVRC2012.tar   -C ${TORCH_HOME} | #tar xvf ./hadoop-data/ILSVRC2012.tar   -C ${TORCH_HOME} | ||||||
|  |  | ||||||
| cifar_dir="${TORCH_HOME}/cifar.python" | cifar_dir="${TORCH_HOME}/cifar.python" | ||||||
|   | |||||||
| @@ -1,7 +1,7 @@ | |||||||
| #!/usr/bin/env sh | #!/usr/bin/env sh | ||||||
| if [ "$#" -ne 3 ] ;then | if [ "$#" -ne 5 ] ;then | ||||||
|   echo "Input illegal number of parameters " $# |   echo "Input illegal number of parameters " $# | ||||||
|   echo "Need 3 parameters for the architecture, and the channel and the layers" |   echo "Need 5 parameters for the architecture, and the channel, and the layers, and the batch-size, and the seed" | ||||||
|   exit 1                |   exit 1                | ||||||
| fi  | fi  | ||||||
| if [ "$TORCH_HOME" = "" ]; then | if [ "$TORCH_HOME" = "" ]; then | ||||||
| @@ -15,7 +15,9 @@ arch=$1 | |||||||
| dataset=imagenet | dataset=imagenet | ||||||
| channels=$2 | channels=$2 | ||||||
| layers=$3 | layers=$3 | ||||||
| SAVED=./output/NAS-CNN/${arch}-${dataset}-C${channels}-L${layers}-E250 | BATCH=$4 | ||||||
|  | seed=$5 | ||||||
|  | SAVED=./output/NAS-CNN/${arch}-${dataset}-C${channels}-L${layers}-${BATCH}-E250 | ||||||
|  |  | ||||||
| PY_C="./env/bin/python" | PY_C="./env/bin/python" | ||||||
| #PY_C="$CONDA_PYTHON_EXE" | #PY_C="$CONDA_PYTHON_EXE" | ||||||
| @@ -27,8 +29,8 @@ else | |||||||
|   echo "Cluster Run with Python: "${PY_C} |   echo "Cluster Run with Python: "${PY_C} | ||||||
|   echo "Unzip ILSVRC2012" |   echo "Unzip ILSVRC2012" | ||||||
|   tar --version |   tar --version | ||||||
|   #tar xf ./hadoop-data/ILSVRC2012.tar   -C ${TORCH_HOME} |   tar -xf ./hadoop-data/ILSVRC2012.tar   -C ${TORCH_HOME} | ||||||
|   commands="./data/data/get_imagenet.sh" |   #commands="./data/data/get_imagenet.sh" | ||||||
|   #${PY_C} ./data/decompress.py ./hadoop-data/ILSVRC2012-TAR ./data/data/ILSVRC2012 tar > ${commands} |   #${PY_C} ./data/decompress.py ./hadoop-data/ILSVRC2012-TAR ./data/data/ILSVRC2012 tar > ${commands} | ||||||
|   #${PY_C} ./data/decompress.py ./hadoop-data/ILSVRC2012-ZIP ./data/data/ILSVRC2012 zip > ./data/data/get_imagenet.sh |   #${PY_C} ./data/decompress.py ./hadoop-data/ILSVRC2012-ZIP ./data/data/ILSVRC2012 zip > ./data/data/get_imagenet.sh | ||||||
|   #bash ./data/data/get_imagenet.sh |   #bash ./data/data/get_imagenet.sh | ||||||
| @@ -42,16 +44,16 @@ else | |||||||
|   #  free -g |   #  free -g | ||||||
|   #done < "${commands}" |   #done < "${commands}" | ||||||
|   #wget http://10.127.2.44:8000/ILSVRC2012.tar --directory-prefix=${TORCH_HOME} |   #wget http://10.127.2.44:8000/ILSVRC2012.tar --directory-prefix=${TORCH_HOME} | ||||||
|   ${PY_C} ./data/decompress.py ./data/classes.txt ${TORCH_HOME}/ILSVRC2012 wget > ${commands} |   #${PY_C} ./data/decompress.py ./data/classes.txt ${TORCH_HOME}/ILSVRC2012 wget > ${commands} | ||||||
|   count=0 |   #count=0 | ||||||
|   while read -r line; do |   #while read -r line; do | ||||||
|     temp_file="./data/data/TEMP-${count}.sh" |   #  temp_file="./data/data/TEMP-${count}.sh" | ||||||
|     echo "${line}" > ${temp_file} |   #  echo "${line}" > ${temp_file} | ||||||
|     bash ${temp_file} |   #  bash ${temp_file} | ||||||
|     count=$((count+1)) |   #  count=$((count+1)) | ||||||
|    #${PY_C} ./data/ps_mem.py -p $$ |    #${PY_C} ./data/ps_mem.py -p $$ | ||||||
|   #  free -g |   #  free -g | ||||||
|   done < "${commands}" |   #done < "${commands}" | ||||||
|   #echo "Copy ILSVRC2012 done" |   #echo "Copy ILSVRC2012 done" | ||||||
|   #tar -xvf ${TORCH_HOME}/ILSVRC2012.tar -C ${TORCH_HOME} |   #tar -xvf ${TORCH_HOME}/ILSVRC2012.tar -C ${TORCH_HOME} | ||||||
|   #rm ${TORCH_HOME}/ILSVRC2012.tar |   #rm ${TORCH_HOME}/ILSVRC2012.tar | ||||||
| @@ -66,5 +68,6 @@ ${PY_C} ./exps-cnn/train_base.py \ | |||||||
| 	--save_path ${SAVED} \ | 	--save_path ${SAVED} \ | ||||||
| 	--grad_clip 5 \ | 	--grad_clip 5 \ | ||||||
| 	--init_channels ${channels} --layers ${layers} \ | 	--init_channels ${channels} --layers ${layers} \ | ||||||
| 	--model_config ./configs/nas-imagenet.config \ | 	--model_config ./configs/nas-imagenet-${BATCH}.config \ | ||||||
|  | 	--manualSeed ${seed} \ | ||||||
| 	--print_freq 200 --workers 20 | 	--print_freq 200 --workers 20 | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user