#!/usr/bin/env python
from torque_utils import pbsnodes

# this config allows mcore to run up to full ATLAS allocation (07.04.2015)
# full allocation right now is 47% of "non-retired" capacity = knal + mars + car
# just configure a fraction (CAPACFRAC, set below) of each class.

MAXDRAIN = 32        # max num of nodes allowed to drain
MAXFREE = 49         # max num of free slots to tolerate
MAXFREEPERNODE = 3   # max num of free slots to tolerate on any single node
NODESPERGRAB = 3     # number of nodes to grab each time around
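
# Rough reading of these limits, for orientation only (the checks that use them
# appear further down in the script): MAXDRAIN caps how many nodes may be
# draining at once, MAXFREE caps the total number of idle slots tolerated, and
# MAXFREEPERNODE looks like the per-node analogue.  With the 32-core knal nodes
# mentioned at the end of this config block, MAXFREE corresponds to roughly one
# and a half nodes' worth of cores:
#
#   >>> MAXFREE / 32.0
#   1.53125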

# parameterize percent capacity -- 47% is too conservative, make this variable.

CAPACFRAC = 0.60
CANDIDATE_NODES = [ 'wn-car-0%02d.farm.nikhef.nl' % (n) for n in range(1, int(CAPACFRAC*96)+1) ] + \
                  [ 'wn-knal-0%02d.farm.nikhef.nl' % (n) for n in range(1,int(CAPACFRAC*18)+1) ] + \
                  [ 'wn-mars-0%02d.farm.nikhef.nl' % (n) for n in range(20,20+int(CAPACFRAC*57)+1) ]
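
# Illustration of what the comprehension above selects at CAPACFRAC = 0.60
# (the class sizes -- 96 car, 18 knal and 57 mars nodes, with mars numbering
# starting at 20 -- are taken from the ranges above; the counts shift with
# CAPACFRAC):
#
#   >>> int(0.60*96), int(0.60*18), int(0.60*57)
#   (57, 10, 34)
#
# i.e. wn-car-001..057 and wn-knal-001..010; the mars slice, written with an
# extra +1, runs wn-mars-020..054 and so takes one node more than its 60% share.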

MCQUEUE = 'atlasmc'

# settings below appropriate for knal nodes (32 cores)

# ... (lines 23-191 of the script not shown) ...

p = optparse.OptionParser(description="Monitor state of multicore pool and adjust size as needed",
                          usage=usage)

p.add_option("-A",action="store_true",dest="addall",default=False,
             help="add all eligible nodes to multicore pool")
p.add_option("-l",action="store",dest="logfile",default=None,
             help="log actions and information to LOGFILE (default stdout)")
p.add_option("-L",action="store",dest="loglevel",default="INFO",
             help="print messages of LOGLEVEL (DEBUG, INFO, WARNING, ..., default INFO)")
p.add_option("-n",action="store_true",dest="noopt",default=False,
             help="don't do anything, just print what would have been done")
p.add_option("-i",action="store_true",dest="info",default=False,
             help="print info on all nodes in multicore pool")

import logging

# ... (lines 208-249 of the script not shown) ...

    draining_slots += node.freeCpu
    if node.freeCpu > 0 : draining_nodes += 1

if opts.info:
    waiting, node_tuples = getmcjobinfo(mcdedicated)
    for t in node_tuples:
        if t[2] == 0:
            print "%28s has %2d running mc jobs, %2d empty slots" % (t[0], t[2], t[3])
        else:
            print "%28s has %2d running mc jobs, %2d empty slots, ratio %4.1f" % (t[0], t[2], t[3], float(t[3])/t[2])
    sys.exit(0)
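
# For reference, the format string above renders a hypothetical entry like this
# (node name and the job/slot counts are made-up values):
#
#   >>> print "%28s has %2d running mc jobs, %2d empty slots, ratio %4.1f" % \
#   ...       ("wn-car-001.farm.nikhef.nl", 8, 4, float(4)/8)
#      wn-car-001.farm.nikhef.nl has  8 running mc jobs,  4 empty slots, ratio  0.5
#
# The tuple layout is inferred from these print statements: t[0] is the node
# name, t[2] the number of running multicore jobs and t[3] the empty slots.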
|
candidate_nodes = list()
for n in mcnodes:
    if n not in mcdedicated:
        candidate_nodes.append(n)

if opts.addall:
    for grabbed_node in candidate_nodes:
        logging.info("%s added to mc node pool" % (grabbed_node.name))
        # candidate_nodes.remove(grabbed_node)
        if not opts.noopt: add_to_mc_pool(grabbed_node)
    sys.exit(0)

# check for dedicated nodes with too few jobs

nodes_with_too_few_jobs = list()

# ... (lines 276-351 of the script not shown) ...

                  (MAXDRAIN, draining_nodes))
    logging.debug("there are also waiting jobs: try to grab another node")
    # the candidate list to grab from (candidate_nodes) was built earlier
    logging.debug("found %d candidate nodes to dedicate to mc" % len(candidate_nodes))
    if len(candidate_nodes) < 1:
        logging.debug("no more nodes, bailing out, nothing more I can do")