ROOT  6.06/09
Reference Guide
THDFSFile.cxx
Go to the documentation of this file.
1 // @(#)root/hdfs:$Id$
2 // Author: Brian Bockelman 29/09/2009
3 
4 /*************************************************************************
5  * Copyright (C) 1995-2002, Rene Brun and Fons Rademakers. *
6  * All rights reserved. *
7  * *
8  * For the licensing terms see $ROOTSYS/LICENSE. *
9  * For the list of contributors see $ROOTSYS/README/CREDITS. *
10  *************************************************************************/
11 
12 /**
13 \class THDFSFile
14 \ingroup IO
15 
16 Reads and writes its data via the HDFS protocols
17 
18 A THDFSFile is like a normal TFile except that it reads and writes
19 its data via the HDFS protocols. For more information on HDFS, see
20 http://hadoop.apache.org/hdfs/.
21 This implementation interfaces with libhdfs, which is a JNI-based
22 library (i.e., it will start a Java JVM internally the first time
23 it is called). At a minimum, you will need your environment's
24 $CLASSPATH variable set up properly to use. Here's an example of
25 one way to properly set your classpath, assuming you use the OSG
26 distribution of Hadoop:
27  $ source $HADOOP_CONF_DIR/hadoop-env.sh
28  $ export CLASSPATH=$HADOOP_CLASSPATH
29 Additionally, you will need a valid libjvm in your $LD_LIBRARY_PATH
30 This is usually found in either:
31  $JAVA_HOME/jre/lib/i386/server
32 or
33  $JAVA_HOME/jre/lib/amd64/server
34 This file can only be used if hdfs support is compiled into ROOT.
35 The HDFS URLs should be of the form:
36  hdfs:///path/to/file/in/HDFS.root
37 Any host or port information will be ignored; this is taken from the
38 node's HDFS configuration files.
39 */
40 
41 #include "syslog.h"
42 #include "assert.h"
43 #include "stdlib.h"
44 
45 #include "THDFSFile.h"
46 #include "TError.h"
47 #include "TSystem.h"
48 #include "TROOT.h"
49 
50 #include "hdfs.h"
51 //#include "hdfsJniHelper.h"
52 
53 // For now, we don't allow any write/fs modification operations.
55 
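// The following snippet is used for developer-level debugging.
// Contributed by Pete Wyckoff of the HDFS project.
// Note the inverted guard: TRACE(x) expands to a Debug() call only when
// THDFSFile_TRACE is NOT defined; with the #define below it expands to `;`.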
58 #define THDFSFile_TRACE
59 #ifndef THDFSFile_TRACE
60 #define TRACE(x) \
61  Debug("THDFSFile", "%s", x);
62 #else
63 #define TRACE(x);
64 #endif
65 
67 
68 ////////////////////////////////////////////////////////////////////////////////
69 /// Usual Constructor. See the TFile constructor for details.
70 
71 THDFSFile::THDFSFile(const char *path, Option_t *option,
72  const char *ftitle, Int_t compress):
73  TFile(path, "WEB", ftitle, compress)
74 {
75  fHdfsFH = 0;
76  fFS = 0;
77  fSize = -1;
78  fPath = 0;
79  fSysOffset = 0;
80 
81  fOption = option;
82  fOption.ToUpper();
83  Bool_t create = (fOption == "CREATE") ? kTRUE : kFALSE;
84  Bool_t recreate = (fOption == "RECREATE") ? kTRUE : kFALSE;
85  Bool_t update = (fOption == "UPDATE") ? kTRUE : kFALSE;
86  Bool_t read = (fOption == "READ") ? kTRUE : kFALSE;
87  if (!create && !recreate && !update && !read) {
88  read = kTRUE;
89  fOption = "READ";
90  }
91 
92  Bool_t has_authn = kTRUE;
93 
94  if (has_authn) {
95  UserGroup_t *ugi = gSystem->GetUserInfo(0);
96  const char *user = (ugi->fUser).Data();
97  fFS = hdfsConnectAsUser("default", 0, user);
98  delete ugi;
99  } else {
100  fFS = hdfsConnect("default", 0);
101  }
102 
103  if (fFS == 0) {
104  SysError("THDFSFile", "HDFS client for %s cannot open the filesystem",
105  path);
106  goto zombie;
107  }
108 
109  if (create || update || recreate) {
110  Int_t mode = O_RDWR | O_CREAT;
111  if (recreate) mode |= O_TRUNC;
112 
113 #ifndef WIN32
114  fD = SysOpen(path, mode, 0644);
115 #else
116  fD = SysOpen(path, mode | O_BINARY, S_IREAD | S_IWRITE);
117 #endif
118  if (fD == -1) {
119  SysError("THDFSFile", "file %s can not be opened", path);
120  goto zombie;
121  }
122  fWritable = kTRUE;
123  } else {
124 #ifndef WIN32
125  fD = SysOpen(path, O_RDONLY, 0644);
126 #else
127  fD = SysOpen(path, O_RDONLY | O_BINARY, S_IREAD | S_IWRITE);
128 #endif
129  if (fD == -1) {
130  SysError("THDFSFile", "file %s can not be opened for reading", path);
131  goto zombie;
132  }
133  fWritable = kFALSE;
134  }
135 
136  Init(create || recreate);
137 
138  return;
139 
140 zombie:
141  // Error in opening file; make this a zombie
142  MakeZombie();
143  gDirectory = gROOT;
144 }
145 
146 ////////////////////////////////////////////////////////////////////////////////
147 /// Close and clean-up HDFS file.
148 
150 {
151  TRACE("destroy")
152 
153  if (fPath)
154  delete [] fPath;
155 
156  // We assume that the file is closed in SysClose
157  // Explicitly release reference to HDFS filesystem object.
158  // Turned off now due to compilation issues.
159  // The very awkward way of releasing HDFS FS objects (by accessing JNI
160  // internals) is going away in the next libhdfs version.
161 }
162 
163 ////////////////////////////////////////////////////////////////////////////////
164 /// Read specified number of bytes from current offset into the buffer.
165 /// See documentation for TFile::SysRead().
166 
168 {
169  TRACE("READ")
170  tSize num_read = hdfsPread((hdfsFS)fFS, (hdfsFile)fHdfsFH, fSysOffset, buf, len);
171  fSysOffset += len;
172  if (num_read < 0) {
173  gSystem->SetErrorStr(strerror(errno));
174  }
175  return num_read;
176 }
177 
178 ////////////////////////////////////////////////////////////////////////////////
179 /// Seek to a specified position in the file. See TFile::SysSeek().
180 /// Note that THDFSFile does not support seeks when the file is open for write.
181 
183 {
184  TRACE("SEEK")
185  if (whence == SEEK_SET)
186  fSysOffset = offset;
187  else if (whence == SEEK_CUR)
188  fSysOffset += offset;
189  else if (whence == SEEK_END) {
190  if (offset > 0) {
191  SysError("THDFSFile", "Unable to seek past end of file");
192  return -1;
193  }
194  if (fSize == -1) {
195  hdfsFileInfo *info = hdfsGetPathInfo((hdfsFS)fFS, fPath);
196  if (info != 0) {
197  fSize = info->mSize;
198  free(info);
199  } else {
200  SysError("THDFSFile", "Unable to seek to end of file");
201  return -1;
202  }
203  }
204  fSysOffset = fSize;
205  } else {
206  SysError("THDFSFile", "Unknown whence!");
207  return -1;
208  }
209  return fSysOffset;
210 }
211 
212 ////////////////////////////////////////////////////////////////////////////////
213 /// Open a file in HDFS.
214 
215 Int_t THDFSFile::SysOpen(const char * pathname, Int_t flags, UInt_t)
216 {
217  // This is given to us as a URL (hdfs://hadoop-name:9000//foo or
218  // hdfs:///foo); convert this to a file name.
219  TUrl url(pathname);
220  const char * file = url.GetFile();
221  size_t path_size = strlen(file);
222  fPath = new char[path_size+1];
223  if (fPath == 0) {
224  SysError("THDFSFile", "Unable to allocate memory for path.");
225  }
226  strlcpy(fPath, file,path_size+1);
227  if ((fHdfsFH = hdfsOpenFile((hdfsFS)fFS, fPath, flags, 0, 0, 0)) == 0) {
228  SysError("THDFSFile", "Unable to open file %s in HDFS", pathname);
229  return -1;
230  }
231  return 1;
232 }
233 
234 ////////////////////////////////////////////////////////////////////////////////
235 /// Close the file in HDFS.
236 
238 {
239  int result = hdfsCloseFile((hdfsFS)fFS, (hdfsFile)fHdfsFH);
240  fFS = 0;
241  fHdfsFH = 0;
242  return result;
243 }
244 
245 ////////////////////////////////////////////////////////////////////////////////
246 /// Write a buffer into the file; this is not supported currently.
247 
249 {
250  errno = ENOSYS;
251  return -1;
252 }
253 
254 ////////////////////////////////////////////////////////////////////////////////
255 /// Perform a stat on the HDFS file; see TFile::SysStat().
256 
257 Int_t THDFSFile::SysStat(Int_t, Long_t* id, Long64_t* size, Long_t* flags, Long_t* modtime)
258 {
259  *id = ::Hash(fPath);
260 
261  hdfsFileInfo *info = hdfsGetPathInfo((hdfsFS)fFS, fPath);
262  if (info != 0) {
263  fSize = info->mSize;
264  *size = fSize;
265  if (info->mKind == kObjectKindFile)
266  *flags = 0;
267  else if (info->mKind == kObjectKindDirectory)
268  *flags = 1;
269  *modtime = info->mLastMod;
270  free(info);
271  } else {
272  return 1;
273  }
274 
275  return 0;
276 }
277 
278 ////////////////////////////////////////////////////////////////////////////////
279 /// Sync remaining data to disk; Not supported by HDFS.
280 
282 {
283  errno = ENOSYS;
284  return -1;
285 }
286 
287 ////////////////////////////////////////////////////////////////////////////////
288 /// ResetErrno; simply calls TSystem::ResetErrno().
289 
291 {
293 }
294 
295 
296 /**
297 \class THDFSSystem
298 \ingroup IO
299 
300 Directory handler for HDFS (THDFSFile).
301 */
302 
303 
305 
306 ////////////////////////////////////////////////////////////////////////////////
307 
308 THDFSSystem::THDFSSystem() : TSystem("-hdfs", "HDFS Helper System")
309 {
310  SetName("hdfs");
311 
312  Bool_t has_authn = kTRUE;
313 
314  if (has_authn) {
315  UserGroup_t *ugi = gSystem->GetUserInfo(0);
316  const char *user = (ugi->fUser).Data();
317  fFH = hdfsConnectAsUser("default", 0, user);
318  delete ugi;
319  } else {
320  fFH = hdfsConnect("default", 0);
321  }
322 
323  if (fFH == 0) {
324  SysError("THDFSSystem", "HDFS client cannot open the filesystem");
325  goto zombie;
326  }
327 
328  fDirp = 0;
329 
330  return;
331 
332 zombie:
333  // Error in opening file; make this a zombie
334  MakeZombie();
335  gDirectory = gROOT;
336 
337 }
338 
339 ////////////////////////////////////////////////////////////////////////////////
340 /// Make a directory.
341 
343 {
344  if (fFH != 0) {
345  Error("MakeDirectory", "No filesystem handle (should never happen)");
346  return -1;
347  }
348 
349  if (R__HDFS_ALLOW_CHANGES == kTRUE) {
350  return hdfsCreateDirectory((hdfsFS)fFH, path);
351  } else {
352  return -1;
353  }
354 
355 }
356 
357 ////////////////////////////////////////////////////////////////////////////////
358 /// Open a directory via hdfs. Returns an opaque pointer to a dir
359 /// structure. Returns 0 in case of error.
360 
361 void *THDFSSystem::OpenDirectory(const char * path)
362 {
363  if (fFH == 0) {
364  Error("OpenDirectory", "No filesystem handle (should never happen)");
365  return 0;
366  }
367 
368  fDirp = 0;
369 /*
370  if (fDirp) {
371  Error("OpenDirectory", "invalid directory pointer (should never happen)");
372  fDirp = 0;
373  }
374 */
375 
376  hdfsFileInfo * dir = 0;
377  if ((dir = hdfsGetPathInfo((hdfsFS)fFH, path)) == 0) {
378  return 0;
379  }
380  if (dir->mKind != kObjectKindDirectory) {
381  return 0;
382  }
383 
384  fDirp = (void *)hdfsListDirectory((hdfsFS)fFH, path, &fDirEntries);
385  fDirCtr = 0;
386 
387  fUrlp = new TUrl[fDirEntries];
388 
389  return fDirp;
390 }
391 
392 ////////////////////////////////////////////////////////////////////////////////
393 /// Free directory via httpd.
394 
396 {
397  if (fFH == 0) {
398  Error("FreeDirectory", "No filesystem handle (should never happen)");
399  return;
400  }
401  if (dirp != fDirp) {
402  Error("FreeDirectory", "invalid directory pointer (should never happen)");
403  return;
404  }
405  if (fUrlp != 0) {
406  delete fUrlp;
407  }
408 
409  hdfsFreeFileInfo((hdfsFileInfo *)fDirp, fDirEntries);
410  fDirp=0;
411 }
412 
413 ////////////////////////////////////////////////////////////////////////////////
414 /// Get directory entry via httpd. Returns 0 in case no more entries.
415 
416 const char *THDFSSystem::GetDirEntry(void *dirp)
417 {
418  if (fFH == 0) {
419  Error("GetDirEntry", "No filesystem handle (should never happen)");
420  return 0;
421  }
422  if (dirp != fDirp) {
423  Error("GetDirEntry", "invalid directory pointer (should never happen)");
424  return 0;
425  }
426  if (dirp == 0) {
427  Error("GetDirEntry", "Passed an invalid directory pointer.");
428  return 0;
429  }
430 
431  if (fDirCtr == fDirEntries-1) {
432  return 0;
433  }
434 
435  hdfsFileInfo *fileInfo = ((hdfsFileInfo *)dirp) + fDirCtr;
436  fUrlp[fDirCtr].SetUrl(fileInfo->mName);
437  const char * result = fUrlp[fDirCtr].GetFile();
438  TUrl tempUrl;
439  tempUrl.SetUrl("hdfs:///");
440  tempUrl.SetFile(result);
441  fUrlp[fDirCtr].SetUrl(tempUrl.GetUrl());
442  result = fUrlp[fDirCtr].GetUrl();
443  fDirCtr++;
444 
445  return result;
446 }
447 
448 ////////////////////////////////////////////////////////////////////////////////
449 /// Get info about a file. Info is returned in the form of a FileStat_t
450 /// structure (see TSystem.h).
451 /// The function returns 0 in case of success and 1 if the file could
452 /// not be stat'ed.
453 
455 {
456  if (fFH == 0) {
457  Error("GetPathInfo", "No filesystem handle (should never happen)");
458  return 1;
459  }
460  hdfsFileInfo *fileInfo = hdfsGetPathInfo((hdfsFS)fFH, path);
461 
462  if (fileInfo == 0)
463  return 1;
464 
465  buf.fDev = 0;
466  buf.fIno = 0;
467  buf.fMode = fileInfo->mPermissions;
468  buf.fUid = gSystem->GetUid(fileInfo->mOwner);
469  buf.fGid = gSystem->GetGid(fileInfo->mGroup);
470  buf.fSize = fileInfo->mSize;
471  buf.fMtime = fileInfo->mLastAccess;
472  buf.fIsLink = kFALSE;
473 
474  return 0;
475 }
476 
477 ////////////////////////////////////////////////////////////////////////////////
478 /// Returns FALSE if one can access a file using the specified access mode.
479 /// Mode is the same as for the Unix access(2) function.
480 /// Attention, bizarre convention of return value!!
481 
483 {
484  if (mode & kExecutePermission || mode & kWritePermission)
485  return kTRUE;
486 
487  if (fFH == 0) {
488  Error("AccessPathName", "No filesystem handle (should never happen)");
489  return kTRUE;
490  }
491 
492  if (hdfsExists((hdfsFS)fFH, path) == 0)
493  return kFALSE;
494  else
495  return kTRUE;
496 }
497 
498 ////////////////////////////////////////////////////////////////////////////////
499 /// Unlink, i.e. remove, a file or directory. Returns 0 when successful,
500 /// -1 in case of failure.
501 
502 Int_t THDFSSystem::Unlink(const char * path)
503 {
504  if (fFH == 0) {
505  Error("Unlink", "No filesystem handle (should never happen)");
506  return kTRUE;
507  }
508 
509  if (R__HDFS_ALLOW_CHANGES == kTRUE) {
510  return hdfsDelete((hdfsFS)fFH, path, 1);
511  } else {
512  return -1;
513  }
514 }
void SetFile(const char *file)
Definition: TUrl.h:94
void * fDirp
Pointer to the array of file information.
Definition: THDFSFile.h:56
double read(const std::string &file_name)
reading
Int_t SysClose(Int_t fd)
Close the file in HDFS.
Definition: THDFSFile.cxx:237
Int_t SysSync(Int_t fd)
Sync remaining data to disk; Not supported by HDFS.
Definition: THDFSFile.cxx:281
long long Long64_t
Definition: RtypesCore.h:69
Bool_t AccessPathName(const char *path, EAccessMode mode)
Returns FALSE if one can access a file using the specified access mode.
Definition: THDFSFile.cxx:482
void * OpenDirectory(const char *name)
Open a directory via hdfs.
Definition: THDFSFile.cxx:361
const char Option_t
Definition: RtypesCore.h:62
virtual ULong_t Hash() const
Return hash value for this object.
Definition: TNamed.h:53
This class represents a WWW compatible URL.
Definition: TUrl.h:41
Int_t fUid
Definition: TSystem.h:139
static const Bool_t R__HDFS_ALLOW_CHANGES
Definition: THDFSFile.cxx:54
#define gDirectory
Definition: TDirectory.h:218
void SetUrl(const char *url, Bool_t defaultIsFile=kFALSE)
Parse url character string and split in its different subcomponents.
Definition: TUrl.cxx:108
void * fFS
HDFS user handle.
Definition: THDFSFile.h:27
A ROOT file is a suite of consecutive data records (TKey instances) with a well defined format...
Definition: TFile.h:45
virtual ~THDFSFile()
Close and clean-up HDFS file.
Definition: THDFSFile.cxx:149
#define gROOT
Definition: TROOT.h:340
#define O_BINARY
Definition: civetweb.c:273
Int_t SysStat(Int_t fd, Long_t *id, Long64_t *size, Long_t *flags, Long_t *modtime)
Perform a stat on the HDFS file; see TFile::SysStat().
Definition: THDFSFile.cxx:257
int Int_t
Definition: RtypesCore.h:41
bool Bool_t
Definition: RtypesCore.h:59
Int_t Unlink(const char *path)
Unlink, i.e.
Definition: THDFSFile.cxx:502
const Bool_t kFALSE
Definition: Rtypes.h:92
void FreeDirectory(void *dirp)
Free directory via httpd.
Definition: THDFSFile.cxx:395
Long_t fMtime
Definition: TSystem.h:142
Long64_t fSize
Definition: TSystem.h:141
void ResetErrno() const
ResetErrno; simply calls TSystem::ResetErrno().
Definition: THDFSFile.cxx:290
void SysError(const char *location, const char *msgfmt,...)
Int_t fMode
Definition: TSystem.h:138
ClassImp(THDFSFile) THDFSFile
Usual Constructor. See the TFile constructor for details.
Definition: THDFSFile.cxx:66
Vc_ALWAYS_INLINE void free(T *p)
Frees memory that was allocated with Vc::malloc.
Definition: memory.h:94
void Init(TClassEdit::TInterpreterLookupHelper *helper)
Definition: TClassEdit.cxx:118
Long64_t fSysOffset
Seek offset in file.
Definition: THDFSFile.h:29
std::vector< std::vector< double > > Data
virtual UserGroup_t * GetUserInfo(Int_t uid)
Returns all user info in the UserGroup_t structure.
Definition: TSystem.cxx:1563
Long64_t fSize
File size.
Definition: THDFSFile.h:28
virtual void Error(const char *method, const char *msgfmt,...) const
Issue error message.
Definition: TObject.cxx:918
Int_t fGid
Definition: TSystem.h:140
Reads and writes its data via the HDFS protocols.
Definition: THDFSFile.h:23
TString fUser
Definition: TSystem.h:152
Bool_t fIsLink
Definition: TSystem.h:143
R__EXTERN TSystem * gSystem
Definition: TSystem.h:549
static void update(gsl_integration_workspace *workspace, double a1, double b1, double area1, double error1, double a2, double b2, double area2, double error2)
unsigned int UInt_t
Definition: RtypesCore.h:42
Directory handler for HDFS (THDFSFile).
Definition: THDFSFile.h:52
Int_t fDirCtr
The current position in the fDirp array.
Definition: THDFSFile.h:59
void * fHdfsFH
HDFS file handle.
Definition: THDFSFile.h:26
Int_t SysOpen(const char *pathname, Int_t flags, UInt_t mode)
Open a file in HDFS.
Definition: THDFSFile.cxx:215
HelperImpl< VC_IMPL > Helper
Definition: global.h:478
const char * GetUrl(Bool_t withDeflt=kFALSE) const
Return full URL.
Definition: TUrl.cxx:385
long Long_t
Definition: RtypesCore.h:50
virtual void SysError(const char *method, const char *msgfmt,...) const
Issue system error message.
Definition: TObject.cxx:932
Long64_t SysSeek(Int_t fd, Long64_t offset, Int_t whence)
Seek to a specified position in the file.
Definition: THDFSFile.cxx:182
#define TRACE(x)
Definition: THDFSFile.cxx:63
virtual Int_t GetGid(const char *group=0)
Returns the group's id. If group = 0, returns current user's group.
Definition: TSystem.cxx:1543
char * fPath
HDFS path name.
Definition: THDFSFile.h:30
virtual Int_t GetUid(const char *user=0)
Returns the user's id. If user = 0, returns current user's id.
Definition: TSystem.cxx:1524
Int_t MakeDirectory(const char *name)
Make a directory.
Definition: THDFSFile.cxx:342
EAccessMode
Definition: TSystem.h:54
Int_t GetPathInfo(const char *path, FileStat_t &buf)
Get info about a file.
Definition: THDFSFile.cxx:454
TUrl * fUrlp
Pointer to the array of directory content URLs.
Definition: THDFSFile.h:57
const char * GetDirEntry(void *dirp)
Get directory entry via httpd. Returns 0 in case no more entries.
Definition: THDFSFile.cxx:416
Int_t SysRead(Int_t fd, void *buf, Int_t len)
Read specified number of bytes from current offset into the buffer.
Definition: THDFSFile.cxx:167
Long_t fIno
Definition: TSystem.h:137
void * fFH
HDFS filesystem handle.
Definition: THDFSFile.h:55
static void ResetErrno()
Static function resetting system error number.
Definition: TSystem.cxx:280
double result[121]
Int_t fDirEntries
The number of entries in the fDirp array.
Definition: THDFSFile.h:58
void SetErrorStr(const char *errstr)
Set the system error string.
Definition: TSystem.cxx:245
Long_t fDev
Definition: TSystem.h:136
Abstract base class defining a generic interface to the underlying Operating System.
Definition: TSystem.h:258
const Bool_t kTRUE
Definition: Rtypes.h:91
Int_t SysWrite(Int_t fd, const void *buf, Int_t len)
Write a buffer into the file; this is not supported currently.
Definition: THDFSFile.cxx:248
gr SetName("gr")
const char * GetFile() const
Definition: TUrl.h:78