update scripts

Xuanyi Dong 2019-03-30 02:10:20 +08:00
parent 3734384b68
commit c8dddf9cf9
9 changed files with 61 additions and 23 deletions

View File

@@ -7,6 +7,7 @@ import torch.nn.functional as F
 import torchvision.datasets as dset
 import torch.backends.cudnn as cudnn
 import torchvision.transforms as transforms
+import multiprocessing
 from pathlib import Path
 lib_dir = (Path(__file__).parent / '..' / 'lib').resolve()
 print ('lib-dir : {:}'.format(lib_dir))
@@ -29,7 +30,7 @@ parser.add_argument('--config_path', type=str, help='the training configur
 parser.add_argument('--save_path', type=str, help='Folder to save checkpoints and log.')
 parser.add_argument('--print_freq', type=int, help='print frequency (default: 200)')
 parser.add_argument('--manualSeed', type=int, help='manual seed')
-parser.add_argument('--threads', type=int, default=10, help='the number of threads')
+parser.add_argument('--threads', type=int, default=4, help='the number of threads')
 args = parser.parse_args()
 assert torch.cuda.is_available(), 'torch.cuda is not available'
@@ -50,7 +51,7 @@ def main():
   if not os.path.isdir(args.save_path):
     os.makedirs(args.save_path)
   log = open(os.path.join(args.save_path, 'log-seed-{:}-{:}.txt'.format(args.manualSeed, time_file_str())), 'w')
-  print_log('save path : {}'.format(args.save_path), log)
+  print_log('save path : {:}'.format(args.save_path), log)
   state = {k: v for k, v in args._get_kwargs()}
   print_log(state, log)
   print_log("Random Seed: {}".format(args.manualSeed), log)
@@ -59,6 +60,7 @@ def main():
   print_log("CUDA version : {}".format(torch.version.cuda), log)
   print_log("cuDNN version : {}".format(cudnn.version()), log)
   print_log("Num of GPUs : {}".format(torch.cuda.device_count()), log)
+  print_log("Num of CPUs : {}".format(multiprocessing.cpu_count()), log)
   config = load_config( args.config_path )
   genotype = Networks[ args.arch ]
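The hunk above adds a CPU-count log line and lowers the default `--threads` from 10 to 4. As an optional pre-flight check (not part of this commit), the value that the new "Num of CPUs" line will report can be inspected before choosing `--threads`, using the `./env/bin/python` interpreter that the cluster scripts unpack, or any local Python:

```
# Optional check: how many CPUs does the node expose?
# This is what the new "Num of CPUs" log line prints; a matching
# --threads value can then be passed instead of relying on the default of 4.
./env/bin/python -c "import multiprocessing; print(multiprocessing.cpu_count())"
```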

View File

@@ -3,7 +3,7 @@ from .utils import time_file_str, time_string
 from .utils import test_imagenet_data
 from .utils import print_log
 from .evaluation_utils import obtain_accuracy
-from .draw_pts import draw_points
+#from .draw_pts import draw_points
 from .gpu_manager import GPUManager
 from .save_meta import Save_Meta

View File

@@ -1,9 +1,6 @@
 import os, sys, time
 import numpy as np
-import matplotlib
 import random
-matplotlib.use('agg')
-import matplotlib.pyplot as plt
 class AverageMeter(object):
   """Computes and stores the average and current value"""
@@ -53,6 +50,9 @@ class RecorderMeter(object):
     else: return self.epoch_accuracy[:self.current_epoch, 1].max()
   def plot_curve(self, save_path):
+    import matplotlib
+    matplotlib.use('agg')
+    import matplotlib.pyplot as plt
     title = 'the accuracy/loss curve of train/val'
     dpi = 100
     width, height = 1600, 1000
@@ -97,7 +97,7 @@ class RecorderMeter(object):
     plt.close(fig)
 def print_log(print_string, log):
-  print("{}".format(print_string))
+  print ("{:}".format(print_string))
   if log is not None:
     log.write('{}\n'.format(print_string))
     log.flush()

output/.gitignore vendored
View File

@@ -1 +0,0 @@
-*

View File

@@ -0,0 +1,9 @@
+# Commands on Cluster
+## RNN
+```
+bash scripts-cluster/submit.sh yq01-v100-box-idl-2-8 WT2-GDAS 1 "bash ./scripts-rnn/train-WT2.sh GDAS"
+bash scripts-cluster/submit.sh yq01-v100-box-idl-2-8 PTB-GDAS 1 "bash ./scripts-rnn/train-PTB.sh GDAS"
+```
+## CNN
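In the RNN commands above, the four positional arguments map to `QUEUE=$1`, `NAME=$2`, `GPUs=$3`, and `CMD=$4` in scripts-cluster/submit.sh, and the training command has to stay inside one pair of quotes because submit.sh appends it verbatim to the generated job script (`echo ${CMD} >> ${JOB_SCRIPT}`). An annotated repeat of the PTB command, purely to illustrate that pattern:

```
#                              QUEUE ($1)            NAME ($2) GPUs ($3) CMD ($4, appended verbatim)
bash scripts-cluster/submit.sh yq01-v100-box-idl-2-8 PTB-GDAS  1         "bash ./scripts-rnn/train-PTB.sh GDAS"
```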

View File

@@ -1,6 +1,13 @@
 #!/bin/bash
 #
 echo "CHECK-DATA-DIR START"
+sh /home/HGCP_Program/software-install/afs_mount/bin/afs_mount.sh \
+COMM_KM_Data COMM_km_2018 \
+`pwd`/hadoop-data \
+afs://xingtian.afs.baidu.com:9902/user/COMM_KM_Data/dongxuanyi/datasets
+tar xvf ./hadoop-data/cifar.python.tar -C ./data/data/
 cifar_dir="./data/data/cifar.python"
 if [ -d ${cifar_dir} ]; then
   echo "Find cifar-dir: "${cifar_dir}
@@ -10,20 +17,17 @@ else
 fi
 echo "CHECK-DATA-DIR DONE"
-sh /home/HGCP_Program/software-install/afs_mount/bin/afs_mount.sh \
-COMM_KM_Data COMM_km_2018 \
-`pwd`/hadoop-data \
-afs://xingtian.afs.baidu.com:9902/user/COMM_KM_Data/dongxuanyi/datasets
-echo "PWD: " `pwd`
-echo "files:: " `ls`
-echo "CUDA_VISIBLE_DEVICES: " ${CUDA_VISIBLE_DEVICES}
 # config python
 PYTHON_ENV=py36_pytorch1.0_env0.1.3.tar.gz
 wget -e "http_proxy=cp01-sys-hic-gpu-02.cp01:8888" http://cp01-sys-hic-gpu-02.cp01/HGCP_DEMO/$PYTHON_ENV > screen.log 2>&1
 tar xzf $PYTHON_ENV
-alias python="./env/bin/python"
-echo "Python: " `which python`
-# real commands
-bash ./scripts-rnn/train-WT2.sh GDAS
+echo "JOB-PWD : " `pwd`
+echo "JOB-files : " `ls`
+echo "JOB-CUDA_VISIBLE_DEVICES: " ${CUDA_VISIBLE_DEVICES}
+echo `./env/bin/python --version`
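A side note on the removed `alias python="./env/bin/python"` line: bash does not expand aliases in non-interactive scripts by default, and the appended training command runs in its own `bash` process anyway, so the alias never affected which interpreter was used. The updated train-*.sh scripts below instead resolve the interpreter explicitly; a minimal sketch of that convention, assuming the env tarball has been extracted into `./env` as above:

```
# Prefer the interpreter unpacked from $PYTHON_ENV; fall back to the python on PATH otherwise.
PY_C="./env/bin/python"
if [ ! -f ${PY_C} ]; then
  PY_C="python"
fi
${PY_C} --version
```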

View File

@@ -18,14 +18,15 @@ QUEUE=$1
 NAME=$2
 GPUs=$3
 CMD=$4
-TIME=$(date +"%Y-%h-%d-%T")
+TIME=$(date +"%Y-%h-%d--%T")
+TIME="${TIME//:/-}"
 JOB_SCRIPT="${FDIR}/tmps/job-${TIME}.sh"
+echo "JOB-SCRIPT: " ${JOB_SCRIPT}
 cat ${FDIR}/job-script.sh > ${JOB_SCRIPT}
 echo ${CMD} >> ${JOB_SCRIPT}
-exit 1
 HGCP_CLIENT_BIN="${HOME}/.hgcp/software-install/HGCP_client/bin"
@@ -42,3 +43,6 @@ ${HGCP_CLIENT_BIN}/submit \
 --gpu-pnode ${GPUs} \
 --time-limit 0 \
 --job-script ${JOB_SCRIPT}
+#--job-script ${FDIR}/job-script.sh
+#echo "JOB-SCRIPT: " ${JOB_SCRIPT}

View File

@@ -7,8 +7,18 @@ fi
 arch=$1
 SAVED=./output/NAS-RNN/Search-${arch}-PTB
-python ./exps-rnn/train_rnn_base.py \
+PY_C="./env/bin/python"
+if [ ! -f ${PY_C} ]; then
+  echo "Local Run with Python: "`which python`
+  PY_C="python"
+else
+  echo "Cluster Run with Python: "${PY_C}
+fi
+${PY_C} --version
+${PY_C} ./exps-rnn/train_rnn_base.py \
 --arch ${arch} \
 --save_path ${SAVED} \
 --config_path ./configs/NAS-PTB-BASE.config \
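With the `PY_C` fallback above, the same script also runs outside the cluster, where `./env/bin/python` does not exist and whatever `python` is on the PATH is used instead:

```
# Local run: ./env/bin/python is absent, so the script falls back to the python on PATH.
bash ./scripts-rnn/train-PTB.sh GDAS
```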

View File

@@ -1,4 +1,4 @@
-#!/usr/bin/env sh
+#!/bin/bash
 if [ "$#" -ne 1 ] ;then
   echo "Input illegal number of parameters " $#
   echo "Need 1 parameters for the architectures"
@@ -7,8 +7,18 @@ fi
 arch=$1
 SAVED=./output/NAS-RNN/Search-${arch}-WT2
-python ./exps-rnn/train_rnn_base.py \
+PY_C="./env/bin/python"
+if [ ! -f ${PY_C} ]; then
+  echo "Local Run with Python: "`which python`
+  PY_C="python"
+else
+  echo "Cluster Run with Python: "${PY_C}
+fi
+${PY_C} --version
+${PY_C} ./exps-rnn/train_rnn_base.py \
 --arch ${arch} \
 --save_path ${SAVED} \
 --config_path ./configs/NAS-WT2-BASE.config \