From c8dddf9cf98167afb40917092b578c2b12c7409c Mon Sep 17 00:00:00 2001
From: Xuanyi Dong <280835372@qq.com>
Date: Sat, 30 Mar 2019 02:10:20 +0800
Subject: [PATCH] update scripts

---
 exps-rnn/train_rnn_base.py    |  6 ++++--
 lib/utils/__init__.py         |  2 +-
 lib/utils/utils.py            |  8 ++++----
 output/.gitignore             |  1 -
 scripts-cluster/README.md     |  9 +++++++++
 scripts-cluster/job-script.sh | 24 ++++++++++++++----------
 scripts-cluster/submit.sh     |  8 ++++++--
 scripts-rnn/train-PTB.sh      | 12 +++++++++++-
 scripts-rnn/train-WT2.sh      | 14 ++++++++++++--
 9 files changed, 61 insertions(+), 23 deletions(-)
 delete mode 100644 output/.gitignore
 create mode 100644 scripts-cluster/README.md

diff --git a/exps-rnn/train_rnn_base.py b/exps-rnn/train_rnn_base.py
index 518b78f..ab84088 100644
--- a/exps-rnn/train_rnn_base.py
+++ b/exps-rnn/train_rnn_base.py
@@ -7,6 +7,7 @@ import torch.nn.functional as F
 import torchvision.datasets as dset
 import torch.backends.cudnn as cudnn
 import torchvision.transforms as transforms
+import multiprocessing
 from pathlib import Path
 lib_dir = (Path(__file__).parent / '..' / 'lib').resolve()
 print ('lib-dir : {:}'.format(lib_dir))
@@ -29,7 +30,7 @@ parser.add_argument('--config_path', type=str, help='the training configur
 parser.add_argument('--save_path', type=str, help='Folder to save checkpoints and log.')
 parser.add_argument('--print_freq', type=int, help='print frequency (default: 200)')
 parser.add_argument('--manualSeed', type=int, help='manual seed')
-parser.add_argument('--threads', type=int, default=10, help='the number of threads')
+parser.add_argument('--threads', type=int, default=4, help='the number of threads')
 args = parser.parse_args()
 
 assert torch.cuda.is_available(), 'torch.cuda is not available'
@@ -50,7 +51,7 @@ def main():
   if not os.path.isdir(args.save_path):
     os.makedirs(args.save_path)
   log = open(os.path.join(args.save_path, 'log-seed-{:}-{:}.txt'.format(args.manualSeed, time_file_str())), 'w')
-  print_log('save path : {}'.format(args.save_path), log)
+  print_log('save path : {:}'.format(args.save_path), log)
   state = {k: v for k, v in args._get_kwargs()}
   print_log(state, log)
   print_log("Random Seed: {}".format(args.manualSeed), log)
@@ -59,6 +60,7 @@ def main():
   print_log("CUDA version : {}".format(torch.version.cuda), log)
   print_log("cuDNN version : {}".format(cudnn.version()), log)
   print_log("Num of GPUs : {}".format(torch.cuda.device_count()), log)
+  print_log("Num of CPUs : {}".format(multiprocessing.cpu_count()), log)
 
   config = load_config( args.config_path )
   genotype = Networks[ args.arch ]
diff --git a/lib/utils/__init__.py b/lib/utils/__init__.py
index 47eae7a..ca38ea9 100644
--- a/lib/utils/__init__.py
+++ b/lib/utils/__init__.py
@@ -3,7 +3,7 @@ from .utils import time_file_str, time_string
 from .utils import test_imagenet_data
 from .utils import print_log
 from .evaluation_utils import obtain_accuracy
-from .draw_pts import draw_points
+#from .draw_pts import draw_points
 from .gpu_manager import GPUManager
 
 from .save_meta import Save_Meta
diff --git a/lib/utils/utils.py b/lib/utils/utils.py
index 884f46f..494aa13 100644
--- a/lib/utils/utils.py
+++ b/lib/utils/utils.py
@@ -1,9 +1,6 @@
 import os, sys, time
 import numpy as np
-import matplotlib
 import random
-matplotlib.use('agg')
-import matplotlib.pyplot as plt
 
 class AverageMeter(object):
   """Computes and stores the average and current value"""
@@ -53,6 +50,9 @@ class RecorderMeter(object):
     else: return self.epoch_accuracy[:self.current_epoch, 1].max()
 
   def plot_curve(self, save_path):
+    import matplotlib
+    matplotlib.use('agg')
+    import matplotlib.pyplot as plt
     title = 'the accuracy/loss curve of train/val'
     dpi = 100
     width, height = 1600, 1000
@@ -97,7 +97,7 @@ class RecorderMeter(object):
     plt.close(fig)
 
 def print_log(print_string, log):
-  print("{}".format(print_string))
+  print ("{:}".format(print_string))
   if log is not None:
     log.write('{}\n'.format(print_string))
     log.flush()
diff --git a/output/.gitignore b/output/.gitignore
deleted file mode 100644
index 72e8ffc..0000000
--- a/output/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-*
diff --git a/scripts-cluster/README.md b/scripts-cluster/README.md
new file mode 100644
index 0000000..9c9d714
--- /dev/null
+++ b/scripts-cluster/README.md
@@ -0,0 +1,9 @@
+# Commands on Cluster
+
+## RNN
+```
+bash scripts-cluster/submit.sh yq01-v100-box-idl-2-8 WT2-GDAS 1 "bash ./scripts-rnn/train-WT2.sh GDAS"
+bash scripts-cluster/submit.sh yq01-v100-box-idl-2-8 PTB-GDAS 1 "bash ./scripts-rnn/train-PTB.sh GDAS"
+```
+
+## CNN
diff --git a/scripts-cluster/job-script.sh b/scripts-cluster/job-script.sh
index ffbce42..28b5b7b 100644
--- a/scripts-cluster/job-script.sh
+++ b/scripts-cluster/job-script.sh
@@ -1,6 +1,13 @@
 #!/bin/bash
 #
 echo "CHECK-DATA-DIR START"
+sh /home/HGCP_Program/software-install/afs_mount/bin/afs_mount.sh \
+  COMM_KM_Data COMM_km_2018 \
+  `pwd`/hadoop-data \
+  afs://xingtian.afs.baidu.com:9902/user/COMM_KM_Data/dongxuanyi/datasets
+
+tar xvf ./hadoop-data/cifar.python.tar -C ./data/data/
+
 cifar_dir="./data/data/cifar.python"
 if [ -d ${cifar_dir} ]; then
   echo "Find cifar-dir: "${cifar_dir}
@@ -10,20 +17,17 @@ else
 fi
 echo "CHECK-DATA-DIR DONE"
 
-sh /home/HGCP_Program/software-install/afs_mount/bin/afs_mount.sh \
-  COMM_KM_Data COMM_km_2018 \
-  `pwd`/hadoop-data \
-  afs://xingtian.afs.baidu.com:9902/user/COMM_KM_Data/dongxuanyi/datasets
-
-echo "PWD: " `pwd`
-echo "files:: " `ls`
-echo "CUDA_VISIBLE_DEVICES: " ${CUDA_VISIBLE_DEVICES}
 
 # config python
 PYTHON_ENV=py36_pytorch1.0_env0.1.3.tar.gz
 wget -e "http_proxy=cp01-sys-hic-gpu-02.cp01:8888" http://cp01-sys-hic-gpu-02.cp01/HGCP_DEMO/$PYTHON_ENV > screen.log 2>&1
 tar xzf $PYTHON_ENV
-alias python="./env/bin/python"
+echo "JOB-PWD : " `pwd`
+echo "JOB-files : " `ls`
+echo "JOB-CUDA_VISIBLE_DEVICES: " ${CUDA_VISIBLE_DEVICES}
 
-echo "Python: " `which python`
+echo `./env/bin/python --version`
+
+# real commands
+bash ./scripts-rnn/train-WT2.sh GDAS
 
diff --git a/scripts-cluster/submit.sh b/scripts-cluster/submit.sh
index c0e0bd0..1298d16 100644
--- a/scripts-cluster/submit.sh
+++ b/scripts-cluster/submit.sh
@@ -18,14 +18,15 @@
 QUEUE=$1
 NAME=$2
 GPUs=$3
 CMD=$4
-TIME=$(date +"%Y-%h-%d-%T")
+TIME=$(date +"%Y-%h-%d--%T")
+TIME="${TIME//:/-}"
 
 JOB_SCRIPT="${FDIR}/tmps/job-${TIME}.sh"
+echo "JOB-SCRIPT: " ${JOB_SCRIPT}
 
 cat ${FDIR}/job-script.sh > ${JOB_SCRIPT}
 echo ${CMD} >> ${JOB_SCRIPT}
-exit 1
 
 
 HGCP_CLIENT_BIN="${HOME}/.hgcp/software-install/HGCP_client/bin"
@@ -42,3 +43,6 @@ ${HGCP_CLIENT_BIN}/submit \
   --gpu-pnode ${GPUs} \
   --time-limit 0 \
   --job-script ${JOB_SCRIPT}
+
+#--job-script ${FDIR}/job-script.sh
+#echo "JOB-SCRIPT: " ${JOB_SCRIPT}
diff --git a/scripts-rnn/train-PTB.sh b/scripts-rnn/train-PTB.sh
index e667fd6..ff98115 100644
--- a/scripts-rnn/train-PTB.sh
+++ b/scripts-rnn/train-PTB.sh
@@ -7,8 +7,18 @@ fi
 
 arch=$1
 SAVED=./output/NAS-RNN/Search-${arch}-PTB
+PY_C="./env/bin/python"
 
-python ./exps-rnn/train_rnn_base.py \
+if [ ! -f ${PY_C} ]; then
+  echo "Local Run with Python: "`which python`
+  PY_C="python"
+else
+  echo "Cluster Run with Python: "${PY_C}
+fi
+
+${PY_C} --version
+
+${PY_C} ./exps-rnn/train_rnn_base.py \
   --arch ${arch} \
   --save_path ${SAVED} \
   --config_path ./configs/NAS-PTB-BASE.config \
diff --git a/scripts-rnn/train-WT2.sh b/scripts-rnn/train-WT2.sh
index fd61800..8c11d7b 100644
--- a/scripts-rnn/train-WT2.sh
+++ b/scripts-rnn/train-WT2.sh
@@ -1,4 +1,4 @@
-#!/usr/bin/env sh
+#!/bin/bash
 if [ "$#" -ne 1 ] ;then
   echo "Input illegal number of parameters " $#
   echo "Need 1 parameters for the architectures"
@@ -7,8 +7,18 @@ fi
 
 arch=$1
 SAVED=./output/NAS-RNN/Search-${arch}-WT2
+PY_C="./env/bin/python"
 
-python ./exps-rnn/train_rnn_base.py \
+if [ ! -f ${PY_C} ]; then
+  echo "Local Run with Python: "`which python`
+  PY_C="python"
+else
+  echo "Cluster Run with Python: "${PY_C}
+fi
+
+${PY_C} --version
+
+${PY_C} ./exps-rnn/train_rnn_base.py \
   --arch ${arch} \
   --save_path ${SAVED} \
   --config_path ./configs/NAS-WT2-BASE.config \