ROOT  6.06/09
Reference Guide
TS3WebFile.cxx
Go to the documentation of this file.
1 // @(#)root/net:$Id$
2 // Author: Fabio Hernandez 22/01/2013
3 // extending an initial version by Marcelo Sousa (class TAS3File)
4 
5 /*************************************************************************
6  * Copyright (C) 1995-2011, Rene Brun and Fons Rademakers. *
7  * All rights reserved. *
8  * *
9  * For the licensing terms see $ROOTSYS/LICENSE. *
10  * For the list of contributors see $ROOTSYS/README/CREDITS. *
11  *************************************************************************/
12 
13 //////////////////////////////////////////////////////////////////////////
14 // //
15 // TS3WebFile //
16 // //
17 // A TS3WebFile is a TWebFile which retrieves the file contents from a //
18 // web server implementing the REST API of the Amazon S3 protocol. This //
19 // class is meant to be as generic as possible to be used with files //
20 // hosted not only by Amazon S3 servers but also by other providers //
21 // implementing the core of the S3 protocol. //
22 // //
23 // The S3 protocol works on top of HTTPS (and HTTP) and imposes that //
24 // each HTTP request be signed using a specific convention: the request //
25 // must include an 'Authorization' header which contains the signature //
26 // of a concatenation of selected request fields. For signing the //
27 // request, an 'Access Key Id' and a 'Secret Access Key' need to be //
28 // known. These keys are used by the S3 servers to identify the client //
29 // and to authenticate the request as genuine. //
30 // //
31 // As an end user, you must know the Access Key and Secret Access Key //
32 // in order to access each S3 file. They are provided to you by your S3 //
33 // service provider. Those two keys can be provided to ROOT when //
34 // initializing an object of this class by two means: //
35 // a) by using the environmental variables S3_ACCESS_KEY and //
36 // S3_SECRET_KEY, or //
37 // b) by specifying them when opening each file. //
38 // //
39 // The first method is convenient if all the S3 files you want to //
40 // access are hosted by a single provider. The second one is more //
41 // flexible as it allows you to specify which credentials to use //
42 // on a per-file basis. See the documentation of the constructor of //
43 // this class for details on the syntax. //
44 // //
45 // For generating and signing the HTTP request, this class uses //
46 // TS3HTTPRequest. //
47 // //
48 // For more information on the details of S3 protocol please refer to: //
49 // "Amazon Simple Storage Service Developer Guide": //
50 // http://docs.amazonwebservices.com/AmazonS3/latest/dev/Welcome.html //
51 // //
52 // "Amazon Simple Storage Service REST API Reference" //
53 // http://docs.amazonwebservices.com/AmazonS3/latest/API/APIRest.html //
54 //////////////////////////////////////////////////////////////////////////
55 
56 #include "TS3WebFile.h"
57 #include "TROOT.h"
58 #include "TError.h"
59 #include "TSystem.h"
60 #include "TPRegexp.h"
61 #include "TEnv.h"
62 
63 
65 
66 ////////////////////////////////////////////////////////////////////////////////
67 /// Construct a TS3WebFile object. The path argument is a URL of one of the
68 /// following forms:
69 ///
70 /// s3://host.example.com/bucket/path/to/my/file
71 /// s3http://host.example.com/bucket/path/to/my/file
72 /// s3https://host.example.com/bucket/path/to/my/file
73 /// as3://host.example.com/bucket/path/to/my/file
74 ///
75 /// For files hosted by Google Storage, use the following forms:
76 ///
77 /// gs://storage.googleapis.com/bucket/path/to/my/file
78 /// gshttp://storage.googleapis.com/bucket/path/to/my/file
79 /// gshttps://storage.googleapis.com/bucket/path/to/my/file
80 ///
81 /// The 'as3' scheme is accepted for backwards compatibility but its usage is
82 /// deprecated.
83 ///
84 /// The recommended way to create an instance of this class is through
85 /// TFile::Open, for instance:
86 ///
87 /// TFile* f1 = TFile::Open("s3://host.example.com/bucket/path/to/my/file")
88 /// TFile* f2 = TFile::Open("gs://storage.googleapis.com/bucket/path/to/my/file")
89 ///
90 /// The specified scheme (i.e. s3, s3http, s3https, ...) determines the underlying
91 /// transport protocol to use for downloading the file contents, namely HTTP or HTTPS.
92 /// The 's3', 's3https', 'gs' and 'gshttps' schemes imply using HTTPS as the transport
93 /// protocol. The 's3http', 'as3' and 'gshttp' schemes imply using HTTP as the transport
94 /// protocol.
95 ///
96 /// The 'options' argument can contain 'NOPROXY' if you want to bypass
97 /// the HTTP proxy when retrieving this file's contents. As for any TWebFile-derived
98 /// object, the URL of the web proxy can be specified by setting an environmental
99 /// variable 'http_proxy'. If this variable is set, we ask that proxy to route our
100 /// HTTP(S) requests to the file server.
101 ///
102 /// In addition, you can also use the 'options' argument to provide the access key
103 /// and secret key to be used for authentication purposes for this file by using a
104 /// string of the form "AUTH=myAccessKey:mySecretkey". This may be useful to
105 /// open several files hosted by different providers in the same program/macro,
106 /// where the environmental variables solution is not convenient (see below).
107 ///
108 /// If you need to specify both NOPROXY and AUTH separate them by ' '
109 /// (blank), for instance:
110 /// "NOPROXY AUTH=F38XYZABCDeFgH4D0E1F:V+frt4re7J1euSNFnmaf8wwmI4AAAE7kzxZ/TTM+"
111 ///
112 /// Examples:
113 /// TFile* f1 = TFile::Open("s3://host.example.com/bucket/path/to/my/file",
114 /// "NOPROXY AUTH=F38XYZABCDeFgH4D0E1F:V+frt4re7J1euSNFnmaf8wwmI4AAAE7kzxZ/TTM+");
115 /// TFile* f2 = TFile::Open("s3://host.example.com/bucket/path/to/my/file",
116 /// "AUTH=F38XYZABCDeFgH4D0E1F:V+frt4re7J1euSNFnmaf8wwmI4AAAE7kzxZ/TTM+");
117 ///
118 /// If there is no authentication information in the 'options' argument
119 /// (i.e. not AUTH="....") the values of the environmental variables
120 /// S3_ACCESS_KEY and S3_SECRET_KEY (if set) are expected to contain
121 /// the access key id and the secret access key, respectively. You have
122 /// been provided with these credentials by your S3 service provider.
123 ///
124 /// If neither the AUTH information is provided in the 'options' argument
125 /// nor the environmental variables are set, we try to open the file
126 /// without providing any authentication information to the server. This
127 /// is useful when the file's access control allows
128 /// any unidentified user to read the file.
129 
130 TS3WebFile::TS3WebFile(const char* path, Option_t* options)
131  : TWebFile(path, "IO")
132 {
133  // Make sure this is a valid S3 path. We accept 'as3' as a scheme, for
134  // backwards compatibility
135  Bool_t doMakeZombie = kFALSE;
136  TString errorMsg;
137  TString accessKey;
138  TString secretKey;
139  TPMERegexp rex("^([a]?s3|s3http[s]?|gs|gshttp[s]?){1}://([^/]+)/([^/]+)/([^/].*)", "i");
140  if (rex.Match(TString(path)) != 5) {
141  errorMsg = TString::Format("invalid S3 path '%s'", path);
142  doMakeZombie = kTRUE;
143  }
144  else if (!ParseOptions(options, accessKey, secretKey)) {
145  errorMsg = TString::Format("could not parse options '%s'", options);
146  doMakeZombie = kTRUE;
147  }
148 
149  // Should we stop initializing this object?
150  if (doMakeZombie) {
151  Error("TS3WebFile", "%s", (const char*)errorMsg);
152  MakeZombie();
153  gDirectory = gROOT;
154  return;
155  }
156 
157  // Set this S3 object's URL, the bucket name this file is located in
158  // and the object key
159  fS3Request.SetBucket(rex[3]);
160  fS3Request.SetObjectKey(TString::Format("/%s", (const char*)rex[4]));
161 
162  // Initialize super-classes data members (fUrl is a data member of
163  // super-super class TFile)
164  TString protocol = "https";
165  if (rex[1].EndsWith("http", TString::kIgnoreCase) ||
166  rex[1].EqualTo("as3", TString::kIgnoreCase))
167  protocol = "http";
168  fUrl.SetUrl(TString::Format("%s://%s/%s/%s", (const char*)protocol,
169  (const char*)rex[2], (const char*)rex[3], (const char*)rex[4]));
170 
171  // Set S3-specific data members. If the access and secret keys are not
172  // provided in the 'options' argument we look in the environmental
173  // variables.
174  const char* kAccessKeyEnv = "S3_ACCESS_KEY";
175  const char* kSecretKeyEnv = "S3_SECRET_KEY";
176  if (accessKey.IsNull())
177  GetCredentialsFromEnv(kAccessKeyEnv, kSecretKeyEnv, accessKey, secretKey);
178 
179  // Initialize the S3 HTTP request
180  fS3Request.SetHost(fUrl.GetHost());
181  if (accessKey.IsNull() || secretKey.IsNull()) {
182  // We have no authentication information, neither in the options
183  // nor in the enviromental variables. So may be this is a
184  // world-readable file, so let's continue and see if
185  // we can open it.
186  fS3Request.SetAuthType(TS3HTTPRequest::kNoAuth);
187  } else {
188  // Set the authentication information we need to use
189  // for this file
190  fS3Request.SetAuthKeys(accessKey, secretKey);
191  if (rex[1].BeginsWith("gs"))
192  fS3Request.SetAuthType(TS3HTTPRequest::kGoogle);
193  else
194  fS3Request.SetAuthType(TS3HTTPRequest::kAmazon);
195  }
196 
197  // Assume this server does not serve multi-range HTTP GET requests. We
198  // will detect this when the HTTP headers of this files are retrieved
199  // later in the initialization process
200  fUseMultiRange = kFALSE;
201 
202  // Call super-class initializer
204 
205  // Were there some errors opening this file?
206  if (IsZombie() && (accessKey.IsNull() || secretKey.IsNull())) {
207  // We could not open the file and we have no authentication information
208  // so inform the user so that they can check.
209  Error("TS3WebFile", "could not find authentication info in "\
210  "'options' argument and at least one of the environment variables '%s' or '%s' is not set",
211  kAccessKeyEnv, kSecretKeyEnv);
212  }
213 }
214 
215 
216 ////////////////////////////////////////////////////////////////////////////////
217 /// Extracts the S3 authentication key pair (access key and secret key)
218 /// from the options. The authentication credentials can be specified in
219 /// the options provided to the constructor of this class as a string
220 /// containing: "AUTH=<access key>:<secret key>" and can include other
221 /// options, for instance "NOPROXY" for not using the HTTP proxy for
222 /// accessing this file's contents.
223 /// For instance:
224 /// "NOPROXY AUTH=F38XYZABCDeFgHiJkLm:V+frt4re7J1euSNFnmaf8wwmI401234E7kzxZ/TTM+"
225 
226 Bool_t TS3WebFile::ParseOptions(Option_t* options, TString& accessKey, TString& secretKey)
227 {
228  TString optStr = (const char*)options;
229  if (optStr.IsNull())
230  return kTRUE;
231 
232  fNoProxy = kFALSE;
233  if (optStr.Contains("NOPROXY", TString::kIgnoreCase))
234  fNoProxy = kTRUE;
235  CheckProxy();
236 
237  // Look in the options string for the authentication information.
238  TPMERegexp rex("(^AUTH=|^.* AUTH=)([a-z0-9]+):([a-z0-9+/]+)[\\s]*.*$", "i");
239  if (rex.Match(optStr) < 4) {
240  Error("ParseOptions", "expecting options of the form \"AUTH=myAccessKey:mySecretKey\"");
241  return kFALSE;
242  }
243  accessKey = rex[2];
244  secretKey = rex[3];
245  if (gDebug > 0)
246  Info("ParseOptions", "using authentication information from 'options' argument");
247  return kTRUE;
248 }
249 
250 
251 ////////////////////////////////////////////////////////////////////////////////
252 /// Overwrites TWebFile::GetHead() for retrieving the HTTP headers of this
253 /// file. Uses TS3HTTPRequest to generate an HTTP HEAD request which includes
254 /// the authorization header expected by the S3 server.
255 
257 {
259  return TWebFile::GetHead();
260 }
261 
262 
263 ////////////////////////////////////////////////////////////////////////////////
264 /// Overwrites TWebFile::SetMsgReadBuffer10() for setting the HTTP GET
265 /// request compliant to the authentication mechanism used by the S3
266 /// protocol. The GET request must contain an "Authorization" header with
267 /// the signature of the request, generated using the user's secret access
268 /// key.
269 
270 void TS3WebFile::SetMsgReadBuffer10(const char* redirectLocation, Bool_t tempRedirect)
271 {
272  TWebFile::SetMsgReadBuffer10(redirectLocation, tempRedirect);
274  return;
275 }
276 
277 
278 ////////////////////////////////////////////////////////////////////////////////
279 
280 Bool_t TS3WebFile::ReadBuffers(char* buf, Long64_t* pos, Int_t* len, Int_t nbuf)
281 {
282  // Overwrites TWebFile::ReadBuffers() for reading specified byte ranges.
283  // According to the kind of server this file is hosted by, we use a
284  // single HTTP request with a muti-range header or we generate multiple
285  // requests with a single range each.
286 
287  // Does this server support multi-range GET requests?
288  if (fUseMultiRange)
289  return TWebFile::ReadBuffers(buf, pos, len, nbuf);
290 
291  // Send multiple GET requests with a single range of bytes
292  // Adapted from original version by Wang Lu
293  for (Int_t i=0, offset=0; i < nbuf; i++) {
294  TString rangeHeader = TString::Format("Range: bytes=%lld-%lld\r\n\r\n",
295  pos[i], pos[i] + len[i] - 1);
296  TString s3Request = fS3Request.GetRequest(TS3HTTPRequest::kGET, kFALSE) + rangeHeader;
297  if (GetFromWeb10(&buf[offset], len[i], s3Request) == -1)
298  return kTRUE;
299  offset += len[i];
300  }
301  return kFALSE;
302 }
303 
304 
305 ////////////////////////////////////////////////////////////////////////////////
306 /// This method is called by the super-class TWebFile when a HTTP header
307 /// for this file is retrieved. We scan the 'Server' header to detect the
308 /// type of S3 server this file is hosted on and to determine if it is
309 /// known to support multi-range HTTP GET requests. Some S3 servers (for
310 /// instance Amazon's) do not support that feature and when they
311 /// receive a multi-range request they send back the whole file contents.
312 /// For this class, if the server does not support multi-range requests
313 /// we issue multiple single-range requests instead.
314 
315 void TS3WebFile::ProcessHttpHeader(const TString& headerLine)
316 {
317  TPMERegexp rex("^Server: (.+)", "i");
318  if (rex.Match(headerLine) != 2)
319  return;
320 
321  // Extract the identity of this server and compare it to the
322  // identify of the servers known to support multi-range requests.
323  // The list of server identities is expected to be found in ROOT
324  // configuration.
325  TString serverId = rex[1].ReplaceAll("\r", "").ReplaceAll("\n", "");
326  TString multirangeServers(gEnv->GetValue("TS3WebFile.Root.MultiRangeServer", ""));
327  fUseMultiRange = multirangeServers.Contains(serverId, TString::kIgnoreCase) ? kTRUE : kFALSE;
328 }
329 
330 
331 ////////////////////////////////////////////////////////////////////////////////
332 /// Sets the access and secret keys from the environmental variables, if
333 /// they are both set.
334 
335 Bool_t TS3WebFile::GetCredentialsFromEnv(const char* accessKeyEnv, const char* secretKeyEnv,
336  TString& outAccessKey, TString& outSecretKey)
337 {
338  // Look first in the recommended environmental variables. Both variables
339  // must be set.
340  TString accKey = gSystem->Getenv(accessKeyEnv);
341  TString secKey = gSystem->Getenv(secretKeyEnv);
342  if (!accKey.IsNull() && !secKey.IsNull()) {
343  outAccessKey = accKey;
344  outSecretKey = secKey;
345  if (gDebug > 0)
346  Info("GetCredentialsFromEnv", "using authentication information from environmental variables '%s' and '%s'",
347  accessKeyEnv, secretKeyEnv);
348  return kTRUE;
349  }
350 
351  // Look now in the legacy environmental variables, for keeping backwards
352  // compatibility.
353  accKey = gSystem->Getenv("S3_ACCESS_ID"); // Legacy access key
354  secKey = gSystem->Getenv("S3_ACCESS_KEY"); // Legacy secret key
355  if (!accKey.IsNull() && !secKey.IsNull()) {
356  Warning("SetAuthKeys", "usage of S3_ACCESS_ID and S3_ACCESS_KEY environmental variables is deprecated.");
357  Warning("SetAuthKeys", "please use S3_ACCESS_KEY and S3_SECRET_KEY environmental variables.");
358  outAccessKey = accKey;
359  outSecretKey = secKey;
360  return kTRUE;
361  }
362 
363  return kFALSE;
364 }
365 
virtual Int_t GetHead()
Overwrites TWebFile::GetHead() for retrieving the HTTP headers of this file.
Definition: TS3WebFile.cxx:256
TS3HTTPRequest fS3Request
Definition: TS3WebFile.h:95
long long Long64_t
Definition: RtypesCore.h:69
const char Option_t
Definition: RtypesCore.h:62
#define gDirectory
Definition: TDirectory.h:218
virtual void Info(const char *method, const char *msgfmt,...) const
Issue info message.
Definition: TObject.cxx:892
virtual void SetMsgReadBuffer10(const char *redirectLocation=0, Bool_t tempRedirect=kFALSE)
Set GET command for use by ReadBuffer(s)10(), handle redirection if needed.
Definition: TWebFile.cxx:257
#define gROOT
Definition: TROOT.h:340
Basic string class.
Definition: TString.h:137
int Int_t
Definition: RtypesCore.h:41
bool Bool_t
Definition: RtypesCore.h:59
const Bool_t kFALSE
Definition: Rtypes.h:92
virtual Int_t GetFromWeb10(char *buf, Int_t len, const TString &msg)
Read multiple byte range request from web server.
Definition: TWebFile.cxx:638
ClassImp(TS3WebFile) TS3WebFile
Construct a TS3WebFile object.
Definition: TS3WebFile.cxx:64
static TString Format(const char *fmt,...)
Static method which formats a string using a printf style format descriptor and return a TString...
Definition: TString.cxx:2334
Bool_t fNoProxy
Definition: TWebFile.h:53
virtual const char * Getenv(const char *env)
Get environment variable.
Definition: TSystem.cxx:1627
bool BeginsWith(const std::string &theString, const std::string &theSubstring)
virtual Bool_t ReadBuffers(char *buf, Long64_t *pos, Int_t *len, Int_t nbuf)
Read specified byte ranges from remote file via HTTP daemon.
Definition: TWebFile.cxx:487
virtual void Error(const char *method, const char *msgfmt,...) const
Issue error message.
Definition: TObject.cxx:918
void Error(const char *location, const char *msgfmt,...)
R__EXTERN TSystem * gSystem
Definition: TSystem.h:549
TString GetRequest(TS3HTTPRequest::EHTTPVerb httpVerb, Bool_t appendCRLF=kTRUE)
Returns the HTTP request ready to be sent to the server.
virtual Int_t GetValue(const char *name, Int_t dflt)
Returns the integer value for a resource.
Definition: TEnv.cxx:494
Bool_t GetCredentialsFromEnv(const char *accessKeyEnv, const char *secretKeyEnv, TString &outAccessKey, TString &outSecretKey)
Sets the access and secret keys from the environmental variables, if they are both set...
Definition: TS3WebFile.cxx:335
Bool_t IsNull() const
Definition: TString.h:387
Bool_t fUseMultiRange
Definition: TS3WebFile.h:96
virtual void SetMsgReadBuffer10(const char *redirectLocation=0, Bool_t tempRedirect=kFALSE)
Overwrites TWebFile::SetMsgReadBuffer10() for setting the HTTP GET request compliant to the authentic...
Definition: TS3WebFile.cxx:270
virtual void CheckProxy()
Check if shell var "http_proxy" has been set and should be used.
Definition: TWebFile.cxx:342
virtual void ProcessHttpHeader(const TString &headerLine)
This method is called by the super-class TWebFile when a HTTP header for this file is retrieved...
Definition: TS3WebFile.cxx:315
R__EXTERN TEnv * gEnv
Definition: TEnv.h:174
Bool_t ParseOptions(Option_t *options, TString &accessKey, TString &secretKey)
Extracts the S3 authentication key pair (access key and secret key) from the options.
Definition: TS3WebFile.cxx:226
bool EndsWith(const std::string &theString, const std::string &theSubstring)
TString fMsgReadBuffer10
Definition: TWebFile.h:55
Int_t Match(const TString &s, UInt_t start=0)
Runs a match on s against the regex 'this' was created with.
Definition: TPRegexp.cxx:704
TString fMsgGetHead
Definition: TWebFile.h:56
Wrapper for PCRE library (Perl Compatible Regular Expressions).
Definition: TPRegexp.h:103
Bool_t Contains(const char *pat, ECaseCompare cmp=kExact) const
Definition: TString.h:567
virtual Bool_t ReadBuffers(char *buf, Long64_t *pos, Int_t *len, Int_t nbuf)
Read specified byte ranges from remote file via HTTP daemon.
Definition: TS3WebFile.cxx:280
virtual Int_t GetHead()
Get the HTTP header.
Definition: TWebFile.cxx:893
virtual void Init(Bool_t readHeadOnly)
Initialize a TWebFile object.
Definition: TWebFile.cxx:202
R__EXTERN Int_t gDebug
Definition: Rtypes.h:128
const Bool_t kTRUE
Definition: Rtypes.h:91
virtual void Warning(const char *method, const char *msgfmt,...) const
Issue warning message.
Definition: TObject.cxx:904