update scripts

This commit is contained in:
parent 3734384b68
commit c8dddf9cf9
@@ -7,6 +7,7 @@ import torch.nn.functional as F
 import torchvision.datasets as dset
 import torch.backends.cudnn as cudnn
 import torchvision.transforms as transforms
+import multiprocessing
 from pathlib import Path
 lib_dir = (Path(__file__).parent / '..' / 'lib').resolve()
 print ('lib-dir : {:}'.format(lib_dir))
@@ -29,7 +30,7 @@ parser.add_argument('--config_path', type=str, help='the training configur
 parser.add_argument('--save_path', type=str, help='Folder to save checkpoints and log.')
 parser.add_argument('--print_freq', type=int, help='print frequency (default: 200)')
 parser.add_argument('--manualSeed', type=int, help='manual seed')
-parser.add_argument('--threads', type=int, default=10, help='the number of threads')
+parser.add_argument('--threads', type=int, default=4, help='the number of threads')
 args = parser.parse_args()
 
 assert torch.cuda.is_available(), 'torch.cuda is not available'
@@ -50,7 +51,7 @@ def main():
   if not os.path.isdir(args.save_path):
     os.makedirs(args.save_path)
   log = open(os.path.join(args.save_path, 'log-seed-{:}-{:}.txt'.format(args.manualSeed, time_file_str())), 'w')
-  print_log('save path : {}'.format(args.save_path), log)
+  print_log('save path : {:}'.format(args.save_path), log)
   state = {k: v for k, v in args._get_kwargs()}
   print_log(state, log)
   print_log("Random Seed: {}".format(args.manualSeed), log)
@@ -59,6 +60,7 @@ def main():
   print_log("CUDA version : {}".format(torch.version.cuda), log)
   print_log("cuDNN version : {}".format(cudnn.version()), log)
   print_log("Num of GPUs : {}".format(torch.cuda.device_count()), log)
+  print_log("Num of CPUs : {}".format(multiprocessing.cpu_count()), log)
 
   config = load_config( args.config_path )
   genotype = Networks[ args.arch ]
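The added multiprocessing import is used only to log the CPU count alongside the GPU count, and the --threads default drops from 10 to 4. A minimal sketch of how the two pieces could fit together, assuming the intent is to cap data-loading workers by the machine's CPU count (the helper name and the clamping policy are illustrative, not the repository's code):

```python
import multiprocessing

def suggest_num_threads(max_threads=4):
    # Log the CPU count the same way the training script now does, then
    # clamp the worker count to the new --threads default of 4.
    n_cpus = multiprocessing.cpu_count()
    print("Num of CPUs : {:}".format(n_cpus))
    return min(max_threads, n_cpus)

if __name__ == '__main__':
    print("suggested threads : {:}".format(suggest_num_threads()))
```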
@@ -3,7 +3,7 @@ from .utils import time_file_str, time_string
 from .utils import test_imagenet_data
 from .utils import print_log
 from .evaluation_utils import obtain_accuracy
-from .draw_pts import draw_points
+#from .draw_pts import draw_points
 from .gpu_manager import GPUManager
 
 from .save_meta import Save_Meta
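Commenting out the draw_points import means importing this package no longer pulls in the plotting helper; together with the next hunk, which drops the module-level matplotlib imports, the apparent goal is to let the library be imported on nodes where matplotlib is unavailable.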
@@ -1,9 +1,6 @@
 import os, sys, time
 import numpy as np
-import matplotlib
 import random
-matplotlib.use('agg')
-import matplotlib.pyplot as plt
 
 class AverageMeter(object):
   """Computes and stores the average and current value"""
@@ -53,6 +50,9 @@ class RecorderMeter(object):
     else: return self.epoch_accuracy[:self.current_epoch, 1].max()
 
   def plot_curve(self, save_path):
+    import matplotlib
+    matplotlib.use('agg')
+    import matplotlib.pyplot as plt
     title = 'the accuracy/loss curve of train/val'
     dpi = 100
     width, height = 1600, 1000
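The matplotlib imports removed from the top of the file reappear here, scoped to plot_curve. A short sketch of the deferred-import pattern, assuming the intent is to import matplotlib (with the non-interactive 'agg' backend) only when a curve is actually drawn; the plotting body is illustrative, not the repository's full implementation:

```python
def plot_curve(save_path):
    # Import lazily so that merely importing this module works on machines
    # without matplotlib or without a display; 'agg' must be selected before
    # pyplot is imported.
    import matplotlib
    matplotlib.use('agg')
    import matplotlib.pyplot as plt

    dpi, width, height = 100, 1600, 1000
    fig = plt.figure(figsize=(width / dpi, height / dpi), dpi=dpi)
    plt.title('the accuracy/loss curve of train/val')
    # ... draw the recorded train/val accuracy and loss here ...
    if save_path is not None:
        fig.savefig(save_path, dpi=dpi, bbox_inches='tight')
    plt.close(fig)
```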
@@ -97,7 +97,7 @@ class RecorderMeter(object):
     plt.close(fig)
 
 def print_log(print_string, log):
-  print("{}".format(print_string))
+  print ("{:}".format(print_string))
   if log is not None:
     log.write('{}\n'.format(print_string))
     log.flush()
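Note that '{:}'.format(x) with an empty format spec produces the same output as '{}'.format(x); the change only makes the formatting style consistent with the other scripts touched in this commit.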
output/.gitignore (vendored, 1 line)
@@ -1 +0,0 @@
-*
scripts-cluster/README.md (new file, 9 lines)
@@ -0,0 +1,9 @@
+# Commands on Cluster
+
+## RNN
+```
+bash scripts-cluster/submit.sh yq01-v100-box-idl-2-8 WT2-GDAS 1 "bash ./scripts-rnn/train-WT2.sh GDAS"
+bash scripts-cluster/submit.sh yq01-v100-box-idl-2-8 PTB-GDAS 1 "bash ./scripts-rnn/train-PTB.sh GDAS"
+```
+
+## CNN
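In these submit.sh invocations the arguments are, in order, the cluster queue, a job name, the number of GPUs, and the command to run; they correspond to QUEUE=$1, NAME=$2, GPUs=$3, and CMD=$4 in the submit.sh hunk further down.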
@@ -1,6 +1,13 @@
 #!/bin/bash
 #
 echo "CHECK-DATA-DIR START"
+sh /home/HGCP_Program/software-install/afs_mount/bin/afs_mount.sh \
+  COMM_KM_Data COMM_km_2018 \
+  `pwd`/hadoop-data \
+  afs://xingtian.afs.baidu.com:9902/user/COMM_KM_Data/dongxuanyi/datasets
+
+tar xvf ./hadoop-data/cifar.python.tar -C ./data/data/
+
 cifar_dir="./data/data/cifar.python"
 if [ -d ${cifar_dir} ]; then
   echo "Find cifar-dir: "${cifar_dir}
@@ -10,20 +17,17 @@ else
 fi
 echo "CHECK-DATA-DIR DONE"
 
-sh /home/HGCP_Program/software-install/afs_mount/bin/afs_mount.sh \
-  COMM_KM_Data COMM_km_2018 \
-  `pwd`/hadoop-data \
-  afs://xingtian.afs.baidu.com:9902/user/COMM_KM_Data/dongxuanyi/datasets
-
-echo "PWD: " `pwd`
-echo "files:: " `ls`
-echo "CUDA_VISIBLE_DEVICES: " ${CUDA_VISIBLE_DEVICES}
 
 # config python
 PYTHON_ENV=py36_pytorch1.0_env0.1.3.tar.gz
 wget -e "http_proxy=cp01-sys-hic-gpu-02.cp01:8888" http://cp01-sys-hic-gpu-02.cp01/HGCP_DEMO/$PYTHON_ENV > screen.log 2>&1
 tar xzf $PYTHON_ENV
 
-alias python="./env/bin/python"
-echo "Python: " `which python`
+echo "JOB-PWD : " `pwd`
+echo "JOB-files : " `ls`
+echo "JOB-CUDA_VISIBLE_DEVICES: " ${CUDA_VISIBLE_DEVICES}
+
+echo `./env/bin/python --version`
 
+# real commands
+bash ./scripts-rnn/train-WT2.sh GDAS
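submit.sh (below) copies this job script into a temporary file and appends the requested ${CMD} to it, so the train-WT2.sh line added under "# real commands" appears to serve as a built-in example rather than the only job the script can run.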
@@ -18,14 +18,15 @@ QUEUE=$1
 NAME=$2
 GPUs=$3
 CMD=$4
-TIME=$(date +"%Y-%h-%d-%T")
+TIME=$(date +"%Y-%h-%d--%T")
+TIME="${TIME//:/-}"
 
 JOB_SCRIPT="${FDIR}/tmps/job-${TIME}.sh"
+echo "JOB-SCRIPT: " ${JOB_SCRIPT}
 
 cat ${FDIR}/job-script.sh > ${JOB_SCRIPT}
 echo ${CMD} >> ${JOB_SCRIPT}
 
-exit 1
 HGCP_CLIENT_BIN="${HOME}/.hgcp/software-install/HGCP_client/bin"
 
 
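The added TIME="${TIME//:/-}" strips the colons produced by %T, keeping the timestamp safe to embed in the tmps/job-${TIME}.sh filename, and removing the stray exit 1 lets the script actually reach the submission step.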
@@ -42,3 +43,6 @@ ${HGCP_CLIENT_BIN}/submit \
   --gpu-pnode ${GPUs} \
   --time-limit 0 \
   --job-script ${JOB_SCRIPT}
+
+#--job-script ${FDIR}/job-script.sh
+#echo "JOB-SCRIPT: " ${JOB_SCRIPT}
@@ -7,8 +7,18 @@ fi
 
 arch=$1
 SAVED=./output/NAS-RNN/Search-${arch}-PTB
-python ./exps-rnn/train_rnn_base.py \
+PY_C="./env/bin/python"
+
+if [ ! -f ${PY_C} ]; then
+  echo "Local Run with Python: "`which python`
+  PY_C="python"
+else
+  echo "Cluster Run with Python: "${PY_C}
+fi
+
+${PY_C} --version
+
+${PY_C} ./exps-rnn/train_rnn_base.py \
 --arch ${arch} \
 --save_path ${SAVED} \
 --config_path ./configs/NAS-PTB-BASE.config \
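With this change the training scripts prefer the bundled interpreter at ./env/bin/python (the environment unpacked by the cluster job script) and fall back to the system python when that file is absent, so the same script runs both locally and on the cluster.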
@@ -1,4 +1,4 @@
-#!/usr/bin/env sh
+#!/bin/bash
 if [ "$#" -ne 1 ] ;then
   echo "Input illegal number of parameters " $#
   echo "Need 1 parameters for the architectures"
@@ -7,8 +7,18 @@ fi
 
 arch=$1
 SAVED=./output/NAS-RNN/Search-${arch}-WT2
-python ./exps-rnn/train_rnn_base.py \
+PY_C="./env/bin/python"
+
+if [ ! -f ${PY_C} ]; then
+  echo "Local Run with Python: "`which python`
+  PY_C="python"
+else
+  echo "Cluster Run with Python: "${PY_C}
+fi
+
+${PY_C} --version
+
+${PY_C} ./exps-rnn/train_rnn_base.py \
 --arch ${arch} \
 --save_path ${SAVED} \
 --config_path ./configs/NAS-WT2-BASE.config \