/[pdpsoft]/nl.nikhef.ndpf.mcfloat/trunk/mcfloat
ViewVC logotype

Diff of /nl.nikhef.ndpf.mcfloat/trunk/mcfloat

Parent Directory | Revision Log | View Patch

Left column: revision 2708 by templon, Thu May 1 08:21:16 2014 UTC | Right column: revision 2770 by templon, Thu Mar 12 12:54:43 2015 UTC
# Line 1  Line 1 
1  #!/usr/bin/env python  #!/usr/bin/env python
2  from torque_utils import pbsnodes  from torque_utils import pbsnodes
3    
4    # this config allows mcore to run up to full ATLAS allocation (27.02.2015)
5    MAXDRAIN = 32            # max num of nodes allowed to drain
6    MAXFREE  = 49            # max num of free slots to tolerate
7    MAXFREEPERNODE = 3
8    NODESPERGRAB = 3         # number of nodes to grab each time around
9    CANDIDATE_NODES = [ 'wn-car-0%02d.farm.nikhef.nl' % (n) for n in range(1,97) ] + \
10                      [ 'wn-knal-0%02d.farm.nikhef.nl' % (n) for n in range(1,19) ]
11    MCQUEUE  = 'atlasmc'
12    
13  # settings below appropriate for knal nodes (32 cores)  # settings below appropriate for knal nodes (32 cores)
14  # ... want to keep num of draining nodes  # ... want to keep num of draining nodes
15  # to MAXFREE / 7 since giving back a node could be expensive.  # to MAXFREE / 3 since giving back a node could be expensive.
16  # it might already be running several mc jobs  # it might already be running several mc jobs
17    
18  MAXDRAIN = 7             # max num of nodes allowed to drain  # MAXDRAIN = 16            # max num of nodes allowed to drain
19  MAXFREE  = 49            # max num of free slots to tolerate  # MAXFREE  = 49            # max num of free slots to tolerate
20  CANDIDATE_NODES = [ 'wn-knal-0%02d.farm.nikhef.nl' % (n) for n in range(1,19) ]  # MAXFREEPERNODE = 3
21  MCQUEUE  = 'atlasmc'  # NODESPERGRAB = 1
22    # CANDIDATE_NODES = [ 'wn-knal-0%02d.farm.nikhef.nl' % (n) for n in range(1,19) ]
23    # MCQUEUE  = 'atlasmc'
24    
25  # settings below more appropriate for smrt nodes (8 core)  # settings below more appropriate for smrt nodes (8 core)
26  # here giving a node back doesn't really matter, it wasn't  # here giving a node back doesn't really matter, it wasn't
# Line 104  def getmcjobinfo(nodes): Line 115  def getmcjobinfo(nodes):
115      for n in nodes:      for n in nodes:
116          rawlist.append( (n.name, n, running_mc[n.name], n.freeCpu) )          rawlist.append( (n.name, n, running_mc[n.name], n.freeCpu) )
117      from operator import itemgetter      from operator import itemgetter
118      slist = sorted(rawlist, key=lambda x: (x[2], x[3]))      def srtrunningfree(item):
119            # sort optimal for dropping nodes; for nodes with zero running mc jobs, want to
120            # drop least-drained nodes (fewer free slots first in list); for nodes with
121            # running mc jobs, want to shed as few nodes as possible so pick the more-drained
122            # nodes first.  this probably also works for shedding nodes when queue dries up.
123            mcrun      = item[2]
124            emptyslots = item[3]
125            if mcrun == 0:
126                rank = emptyslots
127            else:
128                rank = 32*mcrun - emptyslots  # 32 just a number bigger than expected emptyslots value
129            return rank
130        slist = sorted(rawlist, key=srtrunningfree)
131    
132      return waitingJobs, slist      return waitingJobs, slist
133    
# Line 220  nodes_with_too_few_jobs = list() Line 243  nodes_with_too_few_jobs = list()
243    
244  for node in mcdedicated:  for node in mcdedicated:
245      logging.debug(node.name + " has " + repr(node.freeCpu) + " free slots")      logging.debug(node.name + " has " + repr(node.freeCpu) + " free slots")
246      if node.freeCpu > 7:      if node.freeCpu > MAXFREEPERNODE:
247          nodes_with_too_few_jobs.append(node)          nodes_with_too_few_jobs.append(node)
248    
249  logging.debug("there are " + repr(len(nodes_with_too_few_jobs)) + \  logging.debug("there are " + repr(len(nodes_with_too_few_jobs)) + \
# Line 283  if len(nodes_with_too_few_jobs) > 0 or r Line 306  if len(nodes_with_too_few_jobs) > 0 or r
306    
307  logging.debug("There are " + repr(len(mcdedicated)) + " dedicated nodes and " + \  logging.debug("There are " + repr(len(mcdedicated)) + " dedicated nodes and " + \
308      repr(draining_slots) + " unused slots")      repr(draining_slots) + " unused slots")
309  if (MAXFREE - draining_slots) >= 7:  if (MAXFREE - draining_slots) >= MAXFREEPERNODE:
310      logging.debug("%d unused slots are permitted, so %d more" % (MAXFREE, MAXFREE - draining_slots))      logging.debug("%d unused slots are permitted, so %d more" % (MAXFREE, MAXFREE - draining_slots))
311      logging.debug("headroom of 7+ slots means we can try to grab another node")      logging.debug("headroom of more than %d+ slots means we can try to grab another node" % (MAXFREEPERNODE) )
312      if draining_nodes < MAXDRAIN :      if draining_nodes < MAXDRAIN :
313          # first check if there are actually any waiting jobs to run; if not makes no sense to grab a node.          # first check if there are actually any waiting jobs to run; if not makes no sense to grab a node.
314          waiting, node_tuples = getmcjobinfo(mcdedicated)          waiting, node_tuples = getmcjobinfo(mcdedicated)
# Line 304  if (MAXFREE - draining_slots) >= 7: Line 327  if (MAXFREE - draining_slots) >= 7:
327          if len(candidate_nodes) < 1:          if len(candidate_nodes) < 1:
328              logging.debug("no more nodes, bailing out, nothing more I can do")              logging.debug("no more nodes, bailing out, nothing more I can do")
329              sys.exit(0)              sys.exit(0)
330          grabbed_node=random.choice(candidate_nodes)          for nn in range(min(NODESPERGRAB,len(candidate_nodes))):
331          logging.info("%s added to mc node pool" % (grabbed_node.name))  #            logging.debug("found %d candidate nodes to dedicate to mc" % len(candidate_nodes))
332          if not opts.noopt: add_to_mc_pool(grabbed_node)              grabbed_node=random.choice(candidate_nodes)
333                logging.info("%s added to mc node pool" % (grabbed_node.name))
334                candidate_nodes.remove(grabbed_node)
335                if not opts.noopt: add_to_mc_pool(grabbed_node)
336      else:      else:
337          logging.debug("There are %d nodes with unused slots (draining)" % (draining_nodes))          logging.debug("There are %d nodes with unused slots (draining)" % (draining_nodes))
338          logging.debug("This equals or exceeds the configured max of %d" % (MAXDRAIN))          logging.debug("This equals or exceeds the configured max of %d" % (MAXDRAIN))
# Line 332  elif draining_slots > MAXFREE: Line 358  elif draining_slots > MAXFREE:
358              slots_recovered += unused_slots              slots_recovered += unused_slots
359  else:  else:
360      logging.debug("%d unused slots of allowed %d" % (draining_slots, MAXFREE))      logging.debug("%d unused slots of allowed %d" % (draining_slots, MAXFREE))
361      logging.debug("difference is %d which is less than 7" % \      logging.debug("difference is %d which is less than %d" % \
362                    (MAXFREE - draining_slots) )                    (MAXFREE - draining_slots, MAXFREEPERNODE) )
363      logging.debug("so: doing nothing now.")      logging.debug("so: doing nothing now.")

Legend:
Removed from v.2708  
changed lines
  Added in v.2770

grid.support@nikhef.nl
ViewVC Help
Powered by ViewVC 1.1.28