Logo ROOT  
Reference Guide
TCondor.cxx
Go to the documentation of this file.
1 // @(#)root/proof:$Id$
2 // Author: Maarten Ballintijn 06/12/03
3 
4 /*************************************************************************
5  * Copyright (C) 1995-2001, Rene Brun and Fons Rademakers. *
6  * All rights reserved. *
7  * *
8  * For the licensing terms see $ROOTSYS/LICENSE. *
9  * For the list of contributors see $ROOTSYS/README/CREDITS. *
10  *************************************************************************/
11 
12 //////////////////////////////////////////////////////////////////////////
13 // //
14 // TCondor //
15 // //
16 // Interface to the Condor system. TCondor provides a (partial) API for //
17 // querying and controlling the Condor system, including experimental //
18 // extensions like COD (computing on demand) //
19 // //
20 //////////////////////////////////////////////////////////////////////////
21 
22 #include <stdlib.h>
23 
24 #include "TCondor.h"
25 #include "TList.h"
26 #include "TSystem.h"
27 #include "TObjString.h"
28 #include "TRegexp.h"
29 #include "TProofDebug.h"
30 #include "Riostream.h"
31 #include "TEnv.h"
32 #include "TClass.h"
33 
36 
37 
38 ////////////////////////////////////////////////////////////////////////////////
39 /// Create Condor interface object. Uses Condor apps since there is no
40 /// API yet.
41 
42 TCondor::TCondor(const char *pool) : fPool(pool), fState(kFree)
43 {
44  fClaims = new TList;
45 
46  // Setup Condor
47 
48  TString condorHome = gEnv->GetValue("Proof.CondorHome", (char*)0);
49  if (condorHome != "") {
50  TString path = gSystem->Getenv("PATH");
51  path = condorHome + "/bin:" + path;
52  gSystem->Setenv("PATH",path);
53  }
54 
55  TString condorConf = gEnv->GetValue("Proof.CondorConfig", (char*)0);
56  if (condorConf != "") {
57  gSystem->Setenv("CONDOR_CONFIG",condorConf);
58  }
59 
60  char *loc = gSystem->Which(gSystem->Getenv("PATH"), "condor_cod",
62 
63  if (loc) {
64  fValid = kTRUE;
65  delete [] loc;
66  } else {
67  fValid = kFALSE;
68  }
69 }
70 
71 
72 ////////////////////////////////////////////////////////////////////////////////
73 /// Cleanup Condor interface.
74 
76 {
77  PDB(kCondor,1) Info("~TCondor","fState %d", fState );
78 
79  if (fState != kFree) {
80  Release();
81  }
82  delete fClaims;
83 }
84 
85 
86 ////////////////////////////////////////////////////////////////////////////////
87 /// Print master status
88 
89 void TCondor::Print(Option_t * opt) const
90 {
91  std::cout << "OBJ: " << IsA()->GetName()
92  << "\tPool: \"" << fPool << "\""
93  << "\tState: " << fState << std::endl;
94  fClaims->Print(opt);
95 }
96 
97 
98 ////////////////////////////////////////////////////////////////////////////////
99 /// Claim a VirtualMachine for PROOF usage.
100 
101 TCondorSlave *TCondor::ClaimVM(const char *vm, const char *cmd)
102 {
103  Int_t port = 0;
104 
105  TString claimCmd = Form("condor_cod request -name %s -timeout 10 2>>%s/condor.proof.%d",
106  vm, gSystem->TempDirectory(), gSystem->GetUid() );
107 
108  PDB(kCondor,2) Info("ClaimVM","command: %s", claimCmd.Data());
109  FILE *pipe = gSystem->OpenPipe(claimCmd, "r");
110 
111  if (!pipe) {
112  SysError("ClaimVM","cannot run command: %s", claimCmd.Data());
113  return 0;
114  }
115 
116  TString claimId;
117  TString line;
118  while (line.Gets(pipe)) {
119  PDB(kCondor,3) Info("ClaimVM","line = %s", line.Data());
120 
121  if (line.BeginsWith("ClaimId = \"")) {
122  line.Remove(0, line.Index("\"")+1);
123  line.Chop(); // remove trailing "
124  claimId = line;
125  PDB(kCondor,1) Info("ClaimVM","claim = '%s'", claimId.Data());
126  TRegexp r("[0-9]*$");
127  TString num = line(r);
128  port = 37000 + atoi(num.Data());
129  PDB(kCondor,1) Info("ClaimVM","port = %d", port);
130  }
131  }
132 
133  Int_t r = gSystem->ClosePipe(pipe);
134  if (r) {
135  Error("ClaimVM","command: %s returned %d", claimCmd.Data(), r);
136  return 0;
137  } else {
138  PDB(kCondor,1) Info("ClaimVM","command: %s returned %d", claimCmd.Data(), r);
139  }
140 
141  TString jobad("jobad");
142  FILE *jf = gSystem->TempFileName(jobad);
143 
144  if (jf == 0) return 0;
145 
146  TString str(cmd);
147  str.ReplaceAll("$(Port)", Form("%d", port));
148  fputs(str, jf);
149 
150  fclose(jf);
151 
152  TString activateCmd = Form("condor_cod activate -id '%s' -jobad %s",
153  claimId.Data(), jobad.Data() );
154 
155  PDB(kCondor,2) Info("ClaimVM","command: %s", activateCmd.Data());
156  pipe = gSystem->OpenPipe(activateCmd, "r");
157 
158  if (!pipe) {
159  SysError("ClaimVM","cannot run command: %s", activateCmd.Data());
160  return 0;
161  }
162 
163  while (line.Gets(pipe)) {
164  PDB(kCondor,3) Info("ClaimVM","Activate: line = %s", line.Data());
165  }
166 
167  r = gSystem->ClosePipe(pipe);
168  if (r) {
169  Error("ClaimVM","command: %s returned %d", activateCmd.Data(), r);
170  } else {
171  PDB(kCondor,1) Info("ClaimVM","command: %s returned %d", activateCmd.Data(), r);
172  }
173 
174  gSystem->Unlink(jobad);
175 
176  // TODO: get info at the start for all nodes ...
177  TCondorSlave *claim = new TCondorSlave;
178  claim->fClaimID = claimId;
179  TString node(vm);
180  node = node.Remove(0, node.Index("@")+1);
181  claim->fHostname = node;
182  claim->fPort = port;
183  claim->fPerfIdx = 100; //set performance index to 100 by default
184  claim->fImage = node; //set image to hostname by default
185 
186  return claim;
187 }
188 
189 
190 ////////////////////////////////////////////////////////////////////////////////
191 /// Get the names of the virtual machines in the pool.
192 /// Return a TList of TObjString or 0 in case of failure
193 
195 {
196  TString poolopt = fPool ? Form("-pool %s", fPool.Data()) : "";
197  TString cmd = Form("condor_status %s -format \"%%s\\n\" Name", poolopt.Data());
198 
199  PDB(kCondor,2) Info("GetVirtualMachines","command: %s", cmd.Data());
200 
201  FILE *pipe = gSystem->OpenPipe(cmd, "r");
202 
203  if (!pipe) {
204  SysError("GetVirtualMachines","cannot run command: %s", cmd.Data());
205  return 0;
206  }
207 
208  TString line;
209  TList *l = new TList;
210  while (line.Gets(pipe)) {
211  PDB(kCondor,3) Info("GetVirtualMachines","line = %s", line.Data());
212  if (line != "") l->Add(new TObjString(line));
213  }
214 
215  Int_t r = gSystem->ClosePipe(pipe);
216  if (r) {
217  delete l;
218  Error("GetVirtualMachines","command: %s returned %d", cmd.Data(), r);
219  return 0;
220  } else {
221  PDB(kCondor,1) Info("GetVirtualMachines","command: %s returned %d", cmd.Data(), r);
222  }
223 
224  return l;
225 }
226 
227 
228 ////////////////////////////////////////////////////////////////////////////////
229 /// Claim n virtual machines
230 /// This function figures out the image and performance index before returning
231 /// the list of condor slaves
232 
233 TList *TCondor::Claim(Int_t n, const char *cmd)
234 {
235  if (fState != kFree) {
236  Error("Claim","not in state Free");
237  return 0;
238  }
239 
240  TList *vms = GetVirtualMachines();
241  TIter next(vms);
242  TObjString *vm;
243  for(Int_t i=0; i < n && (vm = (TObjString*) next()) != 0; i++ ) {
244  TCondorSlave *claim = ClaimVM(vm->GetName(), cmd);
245  if (claim != 0) {
246  if ( !GetVmInfo(vm->GetName(), claim->fImage, claim->fPerfIdx) ) {
247  // assume vm is gone
248  delete claim;
249  } else {
250  fClaims->Add(claim);
251  fState = kActive;
252  }
253  }
254  }
255 
256  vms->Delete();
257  delete vms;
258 
259  return fClaims;
260 }
261 
262 
263 ////////////////////////////////////////////////////////////////////////////////
264 /// Claim virtual machine with name vmname
265 /// This function does not figure out the image and performance index before
266 /// returning the condor slave
267 
268 TCondorSlave *TCondor::Claim(const char *vmname, const char *cmd)
269 {
270  if (fState != kFree && fState != kActive) {
271  Error("Claim","not in state Free or Active");
272  return 0;
273  }
274 
275  TCondorSlave *claim = ClaimVM(vmname, cmd);
276  if (claim != 0) {
277  fClaims->Add(claim);
278  fState = kActive;
279  }
280 
281  return claim;
282 }
283 
284 
285 ////////////////////////////////////////////////////////////////////////////////
286 /// Set the state of workers
287 
289 {
290  PDB(kCondor,1) Info("SetState","state: %s (%lld)",
291  state == kSuspended ? "kSuspended" : "kActive", Long64_t(gSystem->Now()));
292  TIter next(fClaims);
293  TCondorSlave *claim;
294  while((claim = (TCondorSlave*) next()) != 0) {
295  TString cmd = Form("condor_cod %s -id '%s'",
296  state == kSuspended ? "suspend" : "resume",
297  claim->fClaimID.Data());
298 
299  PDB(kCondor,2) Info("SetState","command: %s", cmd.Data());
300  FILE *pipe = gSystem->OpenPipe(cmd, "r");
301 
302  if (!pipe) {
303  SysError("SetState","cannot run command: %s", cmd.Data());
304  return kFALSE;
305  }
306 
307  TString line;
308  while (line.Gets(pipe)) {
309  PDB(kCondor,3) Info("SetState","line = %s", line.Data());
310  }
311 
312  Int_t r = gSystem->ClosePipe(pipe);
313  if (r) {
314  Error("SetState","command: %s returned %d", cmd.Data(), r);
315  return kFALSE;
316  } else {
317  PDB(kCondor,1) Info("SetState","command: %s returned %d", cmd.Data(), r);
318  }
319  }
320 
321  fState = state;
322  return kTRUE;
323 }
324 
325 
326 ////////////////////////////////////////////////////////////////////////////////
327 /// Suspend worker
328 
330 {
331  if (fState != kActive) {
332  Error("Suspend","not in state Active");
333  return kFALSE;
334  }
335 
336  return SetState(kSuspended);
337 }
338 
339 
340 ////////////////////////////////////////////////////////////////////////////////
341 /// Resume worker
342 
344 {
345  if (fState != kSuspended) {
346  Error("Suspend","not in state Suspended");
347  return kFALSE;
348  }
349 
350  return SetState(kActive);
351 }
352 
353 
354 ////////////////////////////////////////////////////////////////////////////////
355 /// Release worker
356 
358 {
359  if (fState == kFree) {
360  Error("Suspend","not in state Active or Suspended");
361  return kFALSE;
362  }
363 
364  TCondorSlave *claim;
365  while((claim = (TCondorSlave*) fClaims->First()) != 0) {
366  TString cmd = Form("condor_cod release -id '%s'", claim->fClaimID.Data());
367 
368  PDB(kCondor,2) Info("SetState","command: %s", cmd.Data());
369  FILE *pipe = gSystem->OpenPipe(cmd, "r");
370 
371  if (!pipe) {
372  SysError("Release","cannot run command: %s", cmd.Data());
373  return kFALSE;
374  }
375 
376  TString line;
377  while (line.Gets(pipe)) {
378  PDB(kCondor,3) Info("Release","line = %s", line.Data());
379  }
380 
381  Int_t r = gSystem->ClosePipe(pipe);
382  if (r) {
383  Error("Release","command: %s returned %d", cmd.Data(), r);
384  return kFALSE;
385  } else {
386  PDB(kCondor,1) Info("Release","command: %s returned %d", cmd.Data(), r);
387  }
388 
389  fClaims->Remove(claim);
390  delete claim;
391  }
392 
393  fState = kFree;
394  return kTRUE;
395 }
396 
397 
398 ////////////////////////////////////////////////////////////////////////////////
399 /// Get info about worker status
400 
401 Bool_t TCondor::GetVmInfo(const char *vm, TString &image, Int_t &perfidx) const
402 {
403  TString cmd = Form("condor_status -format \"%%d:\" Mips -format \"%%s\\n\" FileSystemDomain "
404  "-const 'Name==\"%s\"'", vm);
405 
406  PDB(kCondor,2) Info("GetVmInfo","command: %s", cmd.Data());
407  FILE *pipe = gSystem->OpenPipe(cmd, "r");
408 
409  if (!pipe) {
410  SysError("GetVmInfo","cannot run command: %s", cmd.Data());
411  return kFALSE;
412  }
413 
414  TString line;
415  while (line.Gets(pipe)) {
416  PDB(kCondor,3) Info("GetVmInfo","line = %s", line.Data());
417  if (line != "") {
418  TString amips = line(TRegexp("^[0-9]*"));
419  perfidx = atoi(amips);
420  image = line(TRegexp("[^:]+$"));
421  break;
422  }
423  }
424 
425  Int_t r = gSystem->ClosePipe(pipe);
426  if (r) {
427  Error("GetVmInfo","command: %s returned %d", cmd.Data(), r);
428  return kFALSE;
429  } else {
430  PDB(kCondor,1) Info("GetVmInfo","command: %s returned %d", cmd.Data(), r);
431  }
432 
433  return kTRUE;
434 }
435 
436 
437 ////////////////////////////////////////////////////////////////////////////////
438 /// Get image of the worker
439 
440 TString TCondor::GetImage(const char *host) const
441 {
442  TString cmd = Form("condor_status -direct %s -format \"Image:%%s\\n\" "
443  "FileSystemDomain", host);
444 
445  PDB(kCondor,2) Info("GetImage","command: %s", cmd.Data());
446 
447  FILE *pipe = gSystem->OpenPipe(cmd, "r");
448 
449  if (!pipe) {
450  SysError("GetImage","cannot run command: %s", cmd.Data());
451  return "";
452  }
453 
454  TString image;
455  TString line;
456  while (line.Gets(pipe)) {
457  PDB(kCondor,3) Info("GetImage","line = %s", line.Data());
458  if (line != "") {
459  image = line(TRegexp("[^:]+$"));
460  break;
461  }
462  }
463 
464  Int_t r = gSystem->ClosePipe(pipe);
465  if (r) {
466  Error("GetImage","command: %s returned %d", cmd.Data(), r);
467  return "";
468  } else {
469  PDB(kCondor,1) Info("GetImage","command: %s returned %d", cmd.Data(), r);
470  }
471 
472  return image;
473 }
474 
475 
476 ////////////////////////////////////////////////////////////////////////////////
477 /// Print worker status
478 
479 void TCondorSlave::Print(Option_t * /*opt*/ ) const
480 {
481  std::cout << "OBJ: " << IsA()->GetName()
482  << " " << fHostname << ":" << fPort
483  << " Perf: " << fPerfIdx
484  << " Image: " << fImage << std::endl;
485 }
l
auto * l
Definition: textangle.C:4
TSystem::Unlink
virtual int Unlink(const char *name)
Unlink, i.e.
Definition: TSystem.cxx:1379
n
const Int_t n
Definition: legend1.C:16
TCondor
Definition: TCondor.h:52
TCondor::Release
Bool_t Release()
Release worker.
Definition: TCondor.cxx:357
kTRUE
const Bool_t kTRUE
Definition: RtypesCore.h:91
TObject::SysError
virtual void SysError(const char *method, const char *msgfmt,...) const
Issue system error message.
Definition: TObject.cxx:904
TCondorSlave
Definition: TCondor.h:34
TCollection::Print
virtual void Print(Option_t *option="") const
Default print for collections, calls Print(option, 1).
Definition: TCollection.cxx:476
TSystem::Setenv
virtual void Setenv(const char *name, const char *value)
Set environment variable.
Definition: TSystem.cxx:1645
PDB
#define PDB(mask, level)
Definition: TProofDebug.h:56
TCondor::GetVmInfo
Bool_t GetVmInfo(const char *vm, TString &image, Int_t &perfidx) const
Get info about worker status.
Definition: TCondor.cxx:401
TCondor::Print
void Print(Option_t *option="") const
Print master status.
Definition: TCondor.cxx:89
gEnv
R__EXTERN TEnv * gEnv
Definition: TEnv.h:171
TCondor::GetVirtualMachines
TList * GetVirtualMachines() const
Get the names of the virtual machines in the pool.
Definition: TCondor.cxx:194
TList::Delete
virtual void Delete(Option_t *option="")
Remove all objects from the list AND delete all heap based objects.
Definition: TList.cxx:469
TString::Data
const char * Data() const
Definition: TString.h:369
TSystem::Which
virtual char * Which(const char *search, const char *file, EAccessMode mode=kFileExists)
Find location of file in a search path.
Definition: TSystem.cxx:1544
TProofDebug.h
ClassImp
#define ClassImp(name)
Definition: Rtypes.h:364
Form
char * Form(const char *fmt,...)
TObjString.h
TCondor::Suspend
Bool_t Suspend()
Suspend worker.
Definition: TCondor.cxx:329
r
ROOT::R::TRInterface & r
Definition: Object.C:4
TObject::Info
virtual void Info(const char *method, const char *msgfmt,...) const
Issue info message.
Definition: TObject.cxx:864
Long64_t
long long Long64_t
Definition: RtypesCore.h:73
TObject::Error
virtual void Error(const char *method, const char *msgfmt,...) const
Issue error message.
Definition: TObject.cxx:890
TCondor::kFree
@ kFree
Definition: TCondor.h:54
TClass.h
TList.h
TEnv::GetValue
virtual Int_t GetValue(const char *name, Int_t dflt) const
Returns the integer value for a resource.
Definition: TEnv.cxx:491
TCondor::EState
EState
Definition: TCondor.h:54
TSystem::TempFileName
virtual FILE * TempFileName(TString &base, const char *dir=nullptr)
Create a secure temporary file by appending a unique 6 letter string to base.
Definition: TSystem.cxx:1495
TEnv.h
TString
Definition: TString.h:136
TCondorSlave::Print
void Print(Option_t *option="") const
Print worker status.
Definition: TCondor.cxx:479
TCondor::~TCondor
virtual ~TCondor()
Cleanup Condor interface.
Definition: TCondor.cxx:75
bool
TString::ReplaceAll
TString & ReplaceAll(const TString &s1, const TString &s2)
Definition: TString.h:692
TCondorSlave::fPort
Int_t fPort
Definition: TCondor.h:37
TObjString
Definition: TObjString.h:28
TList::First
virtual TObject * First() const
Return the first object in the list. Returns 0 when list is empty.
Definition: TList.cxx:658
TCondor::ClaimVM
TCondorSlave * ClaimVM(const char *vm, const char *cmd)
Claim a VirtualMachine for PROOF usage.
Definition: TCondor.cxx:101
TRegexp.h
Option_t
const typedef char Option_t
Definition: RtypesCore.h:66
TCondorSlave::fPerfIdx
Int_t fPerfIdx
Definition: TCondor.h:38
kExecutePermission
@ kExecutePermission
Definition: TSystem.h:45
TSystem.h
TCondorSlave::fImage
TString fImage
Definition: TCondor.h:39
TCondor::GetImage
TString GetImage(const char *host) const
Get image of the worker.
Definition: TCondor.cxx:440
TSystem::ClosePipe
virtual int ClosePipe(FILE *pipe)
Close the pipe.
Definition: TSystem.cxx:672
TString::Remove
TString & Remove(Ssiz_t pos)
Definition: TString.h:673
kFALSE
const Bool_t kFALSE
Definition: RtypesCore.h:92
TCondor::SetState
Bool_t SetState(EState state)
Set the state of workers.
Definition: TCondor.cxx:288
TCondor::TCondor
TCondor(const char *pool="")
Create Condor interface object.
Definition: TCondor.cxx:42
TCondor::kActive
@ kActive
Definition: TCondor.h:54
TSystem::Now
virtual TTime Now()
Get current time in milliseconds since 0:00 Jan 1 1995.
Definition: TSystem.cxx:464
line
TLine * line
Definition: entrylistblock_figure1.C:235
TCondor.h
TSystem::GetUid
virtual Int_t GetUid(const char *user=nullptr)
Returns the user's id. If user = 0, returns current user's id.
Definition: TSystem.cxx:1558
TCondor::fClaims
TList * fClaims
Definition: TCondor.h:61
TRegexp
Definition: TRegexp.h:31
TString::Index
Ssiz_t Index(const char *pat, Ssiz_t i=0, ECaseCompare cmp=kExact) const
Definition: TString.h:639
gSystem
R__EXTERN TSystem * gSystem
Definition: TSystem.h:559
TCondor::fValid
Bool_t fValid
Definition: TCondor.h:58
TSystem::Getenv
virtual const char * Getenv(const char *env)
Get environment variable.
Definition: TSystem.cxx:1661
TSystem::TempDirectory
virtual const char * TempDirectory() const
Return a user configured or systemwide directory to create temporary files in.
Definition: TSystem.cxx:1480
TList::Remove
virtual TObject * Remove(TObject *obj)
Remove object from the list.
Definition: TList.cxx:821
TCondor::Resume
Bool_t Resume()
Resume worker.
Definition: TCondor.cxx:343
TList::Add
virtual void Add(TObject *obj)
Definition: TList.h:87
TCondor::fState
EState fState
Definition: TCondor.h:60
TIter
Definition: TCollection.h:233
TCondor::kSuspended
@ kSuspended
Definition: TCondor.h:54
Riostream.h
TCondorSlave::fClaimID
TString fClaimID
Definition: TCondor.h:40
TCondorSlave::fHostname
TString fHostname
Definition: TCondor.h:36
TSystem::OpenPipe
virtual FILE * OpenPipe(const char *command, const char *mode)
Open a pipe.
Definition: TSystem.cxx:663
TCondor::Claim
TList * Claim(Int_t n, const char *cmd)
Claim n virtual machines This function figures out the image and performance index before returning t...
Definition: TCondor.cxx:233
TCondor::fPool
TString fPool
Definition: TCondor.h:59
TList
Definition: TList.h:44
int