run_and_encode.sh 2.42 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
#!/bin/bash
# PBS batch job script.  It starts cluster_run/run.sh in the background, waits
# for a boot lock file to disappear, runs test_bed.py through aprun against
# the hosts listed in router_svrs_and_ports.txt, then sends SIGUSR1 to run.sh.
#
# The "#PBS" lines below are resource requests read by the batch scheduler,
# not by bash; keep them ahead of the first executable command.
#   walltime - 6 hour limit for the job
#   -N       - job name
#   nodes    - 256 nodes at ppn=32 with property "xe" plus 128 nodes at
#              ppn=16 with property "xk"
#   flags    - commtransparent (NOTE(review): site/scheduler-specific flag;
#              confirm its meaning against the system documentation)
#PBS -l walltime=6:00:00
#PBS -N run_and_encode
#PBS -l nodes=256:ppn=32:xe+128:ppn=16:xk
#PBS -l flags=commtransparent

# Load the bwpy and bwpy-mpi environment modules, in that order.
for bw_module in bwpy bwpy-mpi; do
  module load "${bw_module}"
done

# Source the activate script of the hpcmongodb "virenv" environment so the
# rest of the job runs with it enabled.
. ~/Development/hpcmongodb/virenv/bin/activate

# ---------------------------------------------------------------------------
# Run parameters (local to this script, not exported).
#   START_DATE / END_DATE - tags embedded in the data directory name
#   CHUNK_FACTOR          - initial chunks requested per shard server
# ---------------------------------------------------------------------------
START_DATE="20180101"
END_DATE="20180201"
CHUNK_FACTOR=5

# ---------------------------------------------------------------------------
# Cluster sizing.  Exported so child processes can read them (presumably
# cluster_run/run.sh; it is launched later in this script).
# ---------------------------------------------------------------------------
CONFIG_SVRS_NUMBER_OF_INSTANCES=7
ROUTER_SVRS_NUMBER_OF_INSTANCES=256
ROUTER_SVRS_CONCURRENCY_PER_NODE=4
SHARD_SVRS_NUMBER_OF_INSTANCES=256
SHARD_SVRS_CONCURRENCY_PER_NODE=2
export CONFIG_SVRS_NUMBER_OF_INSTANCES \
  ROUTER_SVRS_NUMBER_OF_INSTANCES ROUTER_SVRS_CONCURRENCY_PER_NODE \
  SHARD_SVRS_NUMBER_OF_INSTANCES SHARD_SVRS_CONCURRENCY_PER_NODE

# Total initial chunk count handed to sh.shardCollection() below.
INIT_NUM_CHUNKS=$(( CHUNK_FACTOR * SHARD_SVRS_NUMBER_OF_INSTANCES ))

# ---------------------------------------------------------------------------
# MongoDB locations and mode flags.  The data directory name encodes the
# shard count and the date range; MONGO_TMP points at the same directory.
# USE_MEMORY_AS_DISK and READ_ONLY are consumed elsewhere; their exact effect
# is not visible in this script.
# ---------------------------------------------------------------------------
MONGO_BIN="/opt/mongodb/4.0.5/bin"
MONGO_BASE_DIR="/u/staff/saxton/scratch/hpcmongodb/metric_store_${SHARD_SVRS_NUMBER_OF_INSTANCES}_shards_${START_DATE}_${END_DATE}"
MONGO_TMP="${MONGO_BASE_DIR}"
USE_MEMORY_AS_DISK=true
READ_ONLY=true
export MONGO_BIN MONGO_BASE_DIR MONGO_TMP USE_MEMORY_AS_DISK READ_ONLY

# ---------------------------------------------------------------------------
# Mongo shell statements exported as INIT_EVAL_STR.  They enable sharding on
# monitoringData, shard metricData (hashed k_to_h) and torqueData (hashed
# jobid) with INIT_NUM_CHUNKS initial chunks each, disable balancing for both
# collections, create the indexes, disable auto-split and store a chunksize
# setting of 1024 in the config database.
# The here-document keeps the two-space indent on continuation lines, and the
# command substitution drops the trailing newline, so the value has no
# trailing newline.
# ---------------------------------------------------------------------------
INIT_EVAL_STR=$(cat <<EOF
sh.enableSharding("monitoringData")
  sh.shardCollection( "monitoringData.metricData", { k_to_h : "hashed" }, false, { numInitialChunks: ${INIT_NUM_CHUNKS} } )
  sh.disableBalancing("monitoringData.metricData")
  sh.shardCollection( "monitoringData.torqueData", { jobid : "hashed" }, false, { numInitialChunks: ${INIT_NUM_CHUNKS} } )
  sh.disableBalancing("monitoringData.torqueData")
  monitoringData = db.getSiblingDB("monitoringData")
  monitoringData.metricData.createIndex({CompId: 1})
  monitoringData.metricData.createIndex({"#Time": 1})
  monitoringData.torqueData.createIndex({dateTime: 1})
  sh.disableAutoSplit()
  config = db.getSiblingDB("config")
  config.settings.save( { _id:"chunksize", value: 1024 } )
EOF
)
export INIT_EVAL_STR

# Root of the hpcmongodb checkout, hard-coded as an absolute path.  The
# self-locating alternative below was left disabled by the original author:
#   "$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
SRC_DIR="/mnt/a/u/staff/saxton/Development/hpcmongodb"

# Boot lock polled below; exported so child processes can see its path.
export PROG_BOOT_LOCK_FILE="${SRC_DIR}/cluster_run/prog_boot_lock.lock"

# run.sh is started via a relative path, so a failed cd must stop the job
# here instead of launching nothing and then polling a lock forever.
cd "${SRC_DIR}/cluster_run" || {
  echo "cannot cd to ${SRC_DIR}/cluster_run, exiting" >&2
  exit 1
}

# Create the lock before launching run.sh.  One touch is enough: the path is
# absolute, so touching it again after the cd targeted the same file.
touch "${PROG_BOOT_LOCK_FILE}" || {
  echo "cannot create ${PROG_BOOT_LOCK_FILE}, exiting" >&2
  exit 1
}

# Start the cluster launcher in the background and remember its PID; the PID
# is signalled with SIGUSR1 at the end of the job.
./run.sh &
RUN_PID=$!

# Poll until the lock file disappears (presumably removed by run.sh once the
# cluster is up).  If run.sh has already exited while the lock is still
# present, nothing will ever remove it, so fail instead of spinning until the
# walltime limit.  The lock is re-checked after the liveness probe to avoid a
# false alarm when run.sh releases the lock between the two checks.
echo "sleep waitintg for cluster to become available"
while [[ -f "${PROG_BOOT_LOCK_FILE}" ]]; do
  if ! kill -0 "${RUN_PID}" 2>/dev/null && [[ -f "${PROG_BOOT_LOCK_FILE}" ]]; then
    echo "run.sh (pid ${RUN_PID}) exited before releasing ${PROG_BOOT_LOCK_FILE}, exiting" >&2
    exit 1
  fi
  sleep 2
done

# A mongo_force_shutdown.sem file in the data directory is treated as the
# signal that cluster startup failed.
if [[ -f "${MONGO_BASE_DIR}/mongo_force_shutdown.sem" ]]; then
  echo "Cluster Failed Startup, exiting"
  # Exit statuses are truncated to 8 bits, so the previous "exit 256" was
  # reported as status 0 (success).  Use a real non-zero status.
  exit 1
else
  echo "Cluster is alive!"
fi

# Run test_bed.py on the compute nodes against the router hosts, then ask
# run.sh to shut down.  The outcome of the encode step is remembered and used
# as the job's exit status; previously every failure here still ended with
# status 0.  Shutdown is signalled on every path so a failed encode does not
# leave run.sh waiting.
encode_status=0

if cd /u/staff/saxton/Development/rnnsystemmonitor; then
  echo "changed directory to $(pwd). running aprun"
  export NUM_WORKERS=128

  # Host list read from the file in the Mongo data directory (presumably
  # written by run.sh during startup).
  if H=$(< "${MONGO_BASE_DIR}/router_svrs_and_ports.txt"); then
    # $H is intentionally left unquoted, as in the original, so that a
    # whitespace-separated host list reaches test_bed.py as separate words.
    # NOTE(review): confirm the file format; quote "$H" if it is one token.
    # shellcheck disable=SC2086
    aprun -n "${NUM_WORKERS}" -N 1 -- python test_bed.py --hosts $H \
      > "encode_log_$(date +"%d-%m-%Y-%H-%M-%S").txt" || encode_status=$?
  else
    echo "cannot read ${MONGO_BASE_DIR}/router_svrs_and_ports.txt, skipping aprun" >&2
    encode_status=1
  fi
else
  echo "cannot cd to /u/staff/saxton/Development/rnnsystemmonitor, skipping aprun" >&2
  encode_status=1
fi

# Signal the background run.sh started earlier, then wait for all background
# children to finish before the job ends.
kill -SIGUSR1 "${RUN_PID}"

wait

exit "${encode_status}"