diff --git a/configs/nas-cifar-cos-cutB96.config b/configs/nas-cifar-cos-cutB96.config new file mode 100644 index 0000000..830ffbd --- /dev/null +++ b/configs/nas-cifar-cos-cutB96.config @@ -0,0 +1,14 @@ +{ + "type" : ["str", "cosine"], + "batch_size": ["int", 96], + "epochs" : ["int", 600], + "momentum" : ["float", 0.9], + "decay" : ["float", 0.0003], + "LR" : ["float", 0.025], + "LR_MIN" : ["float", 0.0001], + "auxiliary" : ["bool", 1], + "auxiliary_weight" : ["float", 0.4], + "grad_clip" : ["float", 5], + "cutout" : ["int", 16], + "drop_path_prob" : ["float", 0.2] +} diff --git a/exps-cnn/train_base.py b/exps-cnn/train_base.py index d25d0a0..6e8c003 100644 --- a/exps-cnn/train_base.py +++ b/exps-cnn/train_base.py @@ -36,6 +36,9 @@ parser.add_argument('--print_freq', type=int, help='print frequency (default: parser.add_argument('--manualSeed', type=int, help='manual seed') args = parser.parse_args() +if 'CUDA_VISIBLE_DEVICES' not in os.environ: print('Can not find CUDA_VISIBLE_DEVICES in os.environ') +else : print('Find CUDA_VISIBLE_DEVICES={:}'.format(os.environ['CUDA_VISIBLE_DEVICES'])) + assert torch.cuda.is_available(), 'torch.cuda is not available' diff --git a/scripts-cluster/submit.sh b/scripts-cluster/submit.sh index 531d078..43c26bb 100644 --- a/scripts-cluster/submit.sh +++ b/scripts-cluster/submit.sh @@ -22,19 +22,23 @@ TIME=$(date +"%Y-%h-%d--%T") TIME="${TIME//:/-}" JOB_SCRIPT="${FDIR}/tmps/job-${TIME}.sh" +HDFS_DIR="/user/COMM_KM_Data/${USER}/logs/alljobs/${TIME}" echo "JOB-SCRIPT: "${JOB_SCRIPT} cat ${FDIR}/job-script.sh > ${JOB_SCRIPT} echo ${CMD} >> ${JOB_SCRIPT} -HGCP_CLIENT_BIN="${HOME}/.hgcp/software-install/HGCP_client/bin" +${HDP} -mkdir ${HDFS_DIR} +echo "Create "${HDFS_DIR}" done!" +sleep 1s +HGCP_CLIENT_BIN="${HOME}/.hgcp/software-install/HGCP_client/bin" ${HGCP_CLIENT_BIN}/submit \ --hdfs afs://xingtian.afs.baidu.com:9902 \ --hdfs-user COMM_KM_Data \ --hdfs-passwd COMM_km_2018 \ - --hdfs-path /user/COMM_KM_Data/dongxuanyi/logs \ + --hdfs-path ${HDFS_DIR} \ --file-dir ./ \ --job-name ${NAME} \ --queue-name ${QUEUE} \ diff --git a/scripts-cnn/train-imagenet.sh b/scripts-cnn/train-imagenet.sh index a0152ed..9cb5167 100644 --- a/scripts-cnn/train-imagenet.sh +++ b/scripts-cnn/train-imagenet.sh @@ -25,7 +25,7 @@ if [ ! -f ${PY_C} ]; then else echo "Cluster Run with Python: "${PY_C} echo "Unzip ILSVRC2012" - tar xvf ./hadoop-data/ILSVRC2012.tar -C ${TORCH_HOME} + tar xf ./hadoop-data/ILSVRC2012.tar -C ${TORCH_HOME} fi ${PY_C} --version