From c8dddf9cf98167afb40917092b578c2b12c7409c Mon Sep 17 00:00:00 2001
From: Xuanyi Dong <280835372@qq.com>
Date: Sat, 30 Mar 2019 02:10:20 +0800
Subject: [PATCH] update scripts

---
 exps-rnn/train_rnn_base.py    |  6 ++++--
 lib/utils/__init__.py         |  2 +-
 lib/utils/utils.py            |  8 ++++----
 output/.gitignore             |  1 -
 scripts-cluster/README.md     |  9 +++++++++
 scripts-cluster/job-script.sh | 24 ++++++++++++++----------
 scripts-cluster/submit.sh     |  8 ++++++--
 scripts-rnn/train-PTB.sh      | 12 +++++++++++-
 scripts-rnn/train-WT2.sh      | 14 ++++++++++++--
 9 files changed, 61 insertions(+), 23 deletions(-)
 delete mode 100644 output/.gitignore
 create mode 100644 scripts-cluster/README.md

diff --git a/exps-rnn/train_rnn_base.py b/exps-rnn/train_rnn_base.py
index 518b78f..ab84088 100644
--- a/exps-rnn/train_rnn_base.py
+++ b/exps-rnn/train_rnn_base.py
@@ -7,6 +7,7 @@ import torch.nn.functional as F
 import torchvision.datasets as dset
 import torch.backends.cudnn as cudnn
 import torchvision.transforms as transforms
+import multiprocessing
 from pathlib import Path
 lib_dir = (Path(__file__).parent / '..' / 'lib').resolve()
 print ('lib-dir : {:}'.format(lib_dir))
@@ -29,7 +30,7 @@ parser.add_argument('--config_path', type=str, help='the training configur
 parser.add_argument('--save_path', type=str, help='Folder to save checkpoints and log.')
 parser.add_argument('--print_freq', type=int, help='print frequency (default: 200)')
 parser.add_argument('--manualSeed', type=int, help='manual seed')
-parser.add_argument('--threads', type=int, default=10, help='the number of threads')
+parser.add_argument('--threads', type=int, default=4, help='the number of threads')
 args = parser.parse_args()
 
 assert torch.cuda.is_available(), 'torch.cuda is not available'
@@ -50,7 +51,7 @@ def main():
   if not os.path.isdir(args.save_path):
     os.makedirs(args.save_path)
   log = open(os.path.join(args.save_path, 'log-seed-{:}-{:}.txt'.format(args.manualSeed, time_file_str())), 'w')
-  print_log('save path : {}'.format(args.save_path), log)
+  print_log('save path : {:}'.format(args.save_path), log)
   state = {k: v for k, v in args._get_kwargs()}
   print_log(state, log)
   print_log("Random Seed: {}".format(args.manualSeed), log)
@@ -59,6 +60,7 @@ def main():
   print_log("CUDA version : {}".format(torch.version.cuda), log)
   print_log("cuDNN version : {}".format(cudnn.version()), log)
   print_log("Num of GPUs : {}".format(torch.cuda.device_count()), log)
+  print_log("Num of CPUs : {}".format(multiprocessing.cpu_count()), log)
 
   config = load_config( args.config_path )
   genotype = Networks[ args.arch ]
diff --git a/lib/utils/__init__.py b/lib/utils/__init__.py
index 47eae7a..ca38ea9 100644
--- a/lib/utils/__init__.py
+++ b/lib/utils/__init__.py
@@ -3,7 +3,7 @@ from .utils import time_file_str, time_string
 from .utils import test_imagenet_data
 from .utils import print_log
 from .evaluation_utils import obtain_accuracy
-from .draw_pts import draw_points
+#from .draw_pts import draw_points
 from .gpu_manager import GPUManager
 
 from .save_meta import Save_Meta
diff --git a/lib/utils/utils.py b/lib/utils/utils.py
index 884f46f..494aa13 100644
--- a/lib/utils/utils.py
+++ b/lib/utils/utils.py
@@ -1,9 +1,6 @@
 import os, sys, time
 import numpy as np
-import matplotlib
 import random
-matplotlib.use('agg')
-import matplotlib.pyplot as plt
 
 class AverageMeter(object):
   """Computes and stores the average and current value"""
@@ -53,6 +50,9 @@ class RecorderMeter(object):
     else: return self.epoch_accuracy[:self.current_epoch, 1].max()
 
   def plot_curve(self, save_path):
+    import matplotlib
+    matplotlib.use('agg')
+    import matplotlib.pyplot as plt
     title = 'the accuracy/loss curve of train/val'
     dpi = 100
     width, height = 1600, 1000
@@ -97,7 +97,7 @@ class RecorderMeter(object):
     plt.close(fig)
 
 def print_log(print_string, log):
-  print("{}".format(print_string))
+  print ("{:}".format(print_string))
   if log is not None:
     log.write('{}\n'.format(print_string))
     log.flush()
diff --git a/output/.gitignore b/output/.gitignore
deleted file mode 100644
index 72e8ffc..0000000
--- a/output/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-*
diff --git a/scripts-cluster/README.md b/scripts-cluster/README.md
new file mode 100644
index 0000000..9c9d714
--- /dev/null
+++ b/scripts-cluster/README.md
@@ -0,0 +1,9 @@
+# Commands on Cluster
+
+## RNN
+```
+bash scripts-cluster/submit.sh yq01-v100-box-idl-2-8 WT2-GDAS 1 "bash ./scripts-rnn/train-WT2.sh GDAS"
+bash scripts-cluster/submit.sh yq01-v100-box-idl-2-8 PTB-GDAS 1 "bash ./scripts-rnn/train-PTB.sh GDAS"
+```
+
+## CNN
diff --git a/scripts-cluster/job-script.sh b/scripts-cluster/job-script.sh
index ffbce42..28b5b7b 100644
--- a/scripts-cluster/job-script.sh
+++ b/scripts-cluster/job-script.sh
@@ -1,6 +1,13 @@
 #!/bin/bash
 #
 echo "CHECK-DATA-DIR START"
+sh /home/HGCP_Program/software-install/afs_mount/bin/afs_mount.sh \
+  COMM_KM_Data COMM_km_2018 \
+  `pwd`/hadoop-data \
+  afs://xingtian.afs.baidu.com:9902/user/COMM_KM_Data/dongxuanyi/datasets
+
+tar xvf ./hadoop-data/cifar.python.tar -C ./data/data/
+
 cifar_dir="./data/data/cifar.python"
 if [ -d ${cifar_dir} ]; then
   echo "Find cifar-dir: "${cifar_dir}
@@ -10,20 +17,17 @@ else
 fi
 echo "CHECK-DATA-DIR DONE"
 
-sh /home/HGCP_Program/software-install/afs_mount/bin/afs_mount.sh \
-  COMM_KM_Data COMM_km_2018 \
-  `pwd`/hadoop-data \
-  afs://xingtian.afs.baidu.com:9902/user/COMM_KM_Data/dongxuanyi/datasets
-
-echo "PWD: " `pwd`
-echo "files:: " `ls`
-echo "CUDA_VISIBLE_DEVICES: " ${CUDA_VISIBLE_DEVICES}
 
 # config python
 PYTHON_ENV=py36_pytorch1.0_env0.1.3.tar.gz
 wget -e "http_proxy=cp01-sys-hic-gpu-02.cp01:8888" http://cp01-sys-hic-gpu-02.cp01/HGCP_DEMO/$PYTHON_ENV > screen.log 2>&1
 tar xzf $PYTHON_ENV
-alias python="./env/bin/python"
+echo "JOB-PWD : " `pwd`
+echo "JOB-files : " `ls`
+echo "JOB-CUDA_VISIBLE_DEVICES: " ${CUDA_VISIBLE_DEVICES}
 
-echo "Python: " `which python`
+echo `./env/bin/python --version`
+
+# real commands
+bash ./scripts-rnn/train-WT2.sh GDAS
 
diff --git a/scripts-cluster/submit.sh b/scripts-cluster/submit.sh
index c0e0bd0..1298d16 100644
--- a/scripts-cluster/submit.sh
+++ b/scripts-cluster/submit.sh
@@ -18,14 +18,15 @@
 QUEUE=$1
 NAME=$2
 GPUs=$3
 CMD=$4
-TIME=$(date +"%Y-%h-%d-%T")
+TIME=$(date +"%Y-%h-%d--%T")
+TIME="${TIME//:/-}"
 
 JOB_SCRIPT="${FDIR}/tmps/job-${TIME}.sh"
+echo "JOB-SCRIPT: " ${JOB_SCRIPT}
 
 cat ${FDIR}/job-script.sh > ${JOB_SCRIPT}
 echo ${CMD} >> ${JOB_SCRIPT}
-exit 1
 
 
 HGCP_CLIENT_BIN="${HOME}/.hgcp/software-install/HGCP_client/bin"
@@ -42,3 +43,6 @@ ${HGCP_CLIENT_BIN}/submit \
   --gpu-pnode ${GPUs} \
   --time-limit 0 \
   --job-script ${JOB_SCRIPT}
+
+#--job-script ${FDIR}/job-script.sh
+#echo "JOB-SCRIPT: " ${JOB_SCRIPT}
diff --git a/scripts-rnn/train-PTB.sh b/scripts-rnn/train-PTB.sh
index e667fd6..ff98115 100644
--- a/scripts-rnn/train-PTB.sh
+++ b/scripts-rnn/train-PTB.sh
@@ -7,8 +7,18 @@ fi
 
 arch=$1
 SAVED=./output/NAS-RNN/Search-${arch}-PTB
+PY_C="./env/bin/python"
 
-python ./exps-rnn/train_rnn_base.py \
+if [ ! -f ${PY_C} ]; then
+  echo "Local Run with Python: "`which python`
+  PY_C="python"
+else
+  echo "Cluster Run with Python: "${PY_C}
+fi
+
+${PY_C} --version
+
+${PY_C} ./exps-rnn/train_rnn_base.py \
   --arch ${arch} \
   --save_path ${SAVED} \
   --config_path ./configs/NAS-PTB-BASE.config \
diff --git a/scripts-rnn/train-WT2.sh b/scripts-rnn/train-WT2.sh
index fd61800..8c11d7b 100644
--- a/scripts-rnn/train-WT2.sh
+++ b/scripts-rnn/train-WT2.sh
@@ -1,4 +1,4 @@
-#!/usr/bin/env sh
+#!/bin/bash
 if [ "$#" -ne 1 ] ;then
   echo "Input illegal number of parameters " $#
   echo "Need 1 parameters for the architectures"
@@ -7,8 +7,18 @@ fi
 
 arch=$1
 SAVED=./output/NAS-RNN/Search-${arch}-WT2
+PY_C="./env/bin/python"
 
-python ./exps-rnn/train_rnn_base.py \
+if [ ! -f ${PY_C} ]; then
+  echo "Local Run with Python: "`which python`
+  PY_C="python"
+else
+  echo "Cluster Run with Python: "${PY_C}
+fi
+
+${PY_C} --version
+
+${PY_C} ./exps-rnn/train_rnn_base.py \
   --arch ${arch} \
   --save_path ${SAVED} \
   --config_path ./configs/NAS-WT2-BASE.config \