Logo ROOT  
Reference Guide
TS3WebFile.cxx
Go to the documentation of this file.
1// @(#)root/net:$Id$
2// Author: Fabio Hernandez 22/01/2013
3// extending an initial version by Marcelo Sousa (class TAS3File)
4
5/*************************************************************************
6 * Copyright (C) 1995-2011, Rene Brun and Fons Rademakers. *
7 * All rights reserved. *
8 * *
9 * For the licensing terms see $ROOTSYS/LICENSE. *
10 * For the list of contributors see $ROOTSYS/README/CREDITS. *
11 *************************************************************************/
12
13//////////////////////////////////////////////////////////////////////////
14// //
15// TS3WebFile //
16// //
17// A TS3WebFile is a TWebFile which retrieves the file contents from a //
18// web server implementing the REST API of the Amazon S3 protocol. This //
19// class is meant to be as generic as possible to be used with files //
20// hosted not only by Amazon S3 servers but also by other providers //
21// implementing the core of the S3 protocol. //
22// //
23// The S3 protocol works on top of HTTPS (and HTTP) and imposes that //
24// each HTTP request be signed using a specific convention: the request //
25// must include an 'Authorization' header which contains the signature //
26// of a concatenation of selected request fields. For signing the //
27// request, an 'Access Key Id' and a 'Secret Access Key' need to be //
28// known. These keys are used by the S3 servers to identify the client //
29// and to authenticate the request as genuine. //
30// //
31// As an end user, you must know the Access Key and Secret Access Key //
32// in order to access each S3 file. They are provided to you by your S3 //
33// service provider. Those two keys can be provided to ROOT when //
34// initializing an object of this class by two means: //
35// a) by using the environmental variables S3_ACCESS_KEY and //
36// S3_SECRET_KEY, or //
37// b) by specifying them when opening each file. //
38// //
39// You can use AWS temporary security credentials (temporary access key //
40// and secret access key), but you must also give the associated //
41// session token. The token may be set in the S3_SESSION_TOKEN //
42// environmental variable, or on open in the TOKEN option. //
43// //
44// The first method is convenient if all the S3 files you want to //
45// access are hosted by a single provider. The second one is more //
46// flexible as it allows you to specify which credentials to use //
47// on a per-file basis. See the documentation of the constructor of //
48// this class for details on the syntax. //
49// //
50// For generating and signing the HTTP request, this class uses //
51// TS3HTTPRequest. //
52// //
53// For more information on the details of S3 protocol please refer to: //
54// "Amazon Simple Storage Service Developer Guide": //
55// http://docs.amazonwebservices.com/AmazonS3/latest/dev/Welcome.html //
56// //
57// "Amazon Simple Storage Service REST API Reference" //
58// http://docs.amazonwebservices.com/AmazonS3/latest/API/APIRest.html //
59//////////////////////////////////////////////////////////////////////////
60
61#include "TS3WebFile.h"
62#include "TROOT.h"
63#include "TError.h"
64#include "TSystem.h"
65#include "TPRegexp.h"
66#include "TEnv.h"
67
68
70
71////////////////////////////////////////////////////////////////////////////////
72/// Construct a TS3WebFile object. The path argument is a URL of one of the
73/// following forms:
74///
75/// s3://host.example.com/bucket/path/to/my/file
76/// s3http://host.example.com/bucket/path/to/my/file
77/// s3https://host.example.com/bucket/path/to/my/file
78/// as3://host.example.com/bucket/path/to/my/file
79///
80/// For files hosted by Google Storage, use the following forms:
81///
82/// gs://storage.googleapis.com/bucket/path/to/my/file
83/// gshttp://storage.googleapis.com/bucket/path/to/my/file
84/// gshttps://storage.googleapis.com/bucket/path/to/my/file
85///
86/// The 'as3' scheme is accepted for backwards compatibility but its usage is
87/// deprecated.
88///
89/// The recommended way to create an instance of this class is through
90/// TFile::Open, for instance:
91///
92/// TFile* f1 = TFile::Open("s3://host.example.com/bucket/path/to/my/file")
93/// TFile* f2 = TFile::Open("gs://storage.googleapis.com/bucket/path/to/my/file")
94///
95/// The specified scheme (i.e. s3, s3http, s3https, ...) determines the underlying
96/// transport protocol to use for downloading the file contents, namely HTTP or HTTPS.
97/// The 's3', 's3https', 'gs' and 'gshttps' schemes imply using HTTPS as the transport
98/// protocol. The 's3http', 'as3' and 'gshttp' schemes imply using HTTP as the transport
99/// protocol.
100///
101/// The 'options' argument can contain 'NOPROXY' if you want to bypass
102/// the HTTP proxy when retrieving this file's contents. As for any TWebFile-derived
103/// object, the URL of the web proxy can be specified by setting an environmental
104/// variable 'http_proxy'. If this variable is set, we ask that proxy to route our
105/// HTTP(S) requests to the file server.
106///
107/// In addition, you can also use the 'options' argument to provide the access key
108/// and secret key to be used for authentication purposes for this file by using a
109/// string of the form "AUTH=myAccessKey:mySecretkey". This may be useful to
110/// open several files hosted by different providers in the same program/macro,
111/// where the environmental variables solution is not convenient (see below).
112///
113/// To use AWS temporary security credentials you need to specify the session
114/// token. This can be added to the options argument with a string of the form
115/// TOKEN=mySessionToken. The temporary access and secret keys must also be
116/// available, either via the AUTH option or by environmental variable.
117///
118/// If you need to specify more than one option separate them by ' '
119/// (blank), for instance:
120/// "NOPROXY AUTH=F38XYZABCDeFgH4D0E1F:V+frt4re7J1euSNFnmaf8wwmI4AAAE7kzxZ/TTM+"
121///
122/// Examples:
123/// TFile* f1 = TFile::Open("s3://host.example.com/bucket/path/to/my/file",
124/// "NOPROXY AUTH=F38XYZABCDeFgH4D0E1F:V+frt4re7J1euSNFnmaf8wwmI4AAAE7kzxZ/TTM+");
125/// TFile* f2 = TFile::Open("s3://host.example.com/bucket/path/to/my/file",
126/// "AUTH=F38XYZABCDeFgH4D0E1F:V+frt4re7J1euSNFnmaf8wwmI4AAAE7kzxZ/TTM+");
127/// TFile* f3 = TFile::Open("s3://host.example.com/bucket/path/to/my/file",
128/// "TOKEN=AQoDYXdzEM///////////wEa8AHEYmCinjD+TsGEjtgKSMAT6wnY");
129///
130/// If there is no authentication information in the 'options' argument
131/// (i.e. not AUTH="....") the values of the environmental variables
132/// S3_ACCESS_KEY and S3_SECRET_KEY (if set) are expected to contain
133/// the access key id and the secret access key, respectively. You have
134/// been provided with these credentials by your S3 service provider.
135///
136/// If neither the AUTH information is provided in the 'options' argument
137/// nor the environmental variables are set, we try to open the file
138/// without providing any authentication information to the server. This
139/// is useful when the file has an access control policy that allows
140/// any unidentified user to read the file.
141
142TS3WebFile::TS3WebFile(const char* path, Option_t* options)
143 : TWebFile(path, "IO")
144{
 // Overview: validate the S3/GS URL, parse the options, configure the
 // request signer (bucket, object key, credentials) and rewrite fUrl to a
 // plain http(s) URL.
 // NOTE(review): in this listing several commented steps below are not
 // followed by any statement (host setup, the no-auth case, token and
 // auth-type selection, multi-range default, super-class initialization).
 // Confirm those statements against the original source file.

145 // Make sure this is a valid S3 path. We accept 'as3' as a scheme, for
146 // backwards compatibility
147 Bool_t doMakeZombie = kFALSE;
148 TString errorMsg;
149 TString accessKey;
150 TString secretKey;
151 TString token;
 // Capture groups as used below: [1] scheme, [2] host, [3] bucket,
 // [4] object key. A result of 5 is presumably the whole match plus the
 // four groups -- confirm against the TPMERegexp::Match documentation.
152 TPMERegexp rex("^([a]?s3|s3http[s]?|gs|gshttp[s]?){1}://([^/]+)/([^/]+)/([^/].*)", "i");
153 if (rex.Match(TString(path)) != 5) {
154 errorMsg = TString::Format("invalid S3 path '%s'", path);
155 doMakeZombie = kTRUE;
156 }
157 else if (!ParseOptions(options, accessKey, secretKey, token)) {
158 errorMsg = TString::Format("could not parse options '%s'", options);
159 doMakeZombie = kTRUE;
160 }
161
162 // Should we stop initializing this object?
163 if (doMakeZombie) {
164 Error("TS3WebFile", "%s", (const char*)errorMsg);
165 MakeZombie();
167 return;
168 }
169
170 // Set this S3 object's URL, the bucket name this file is located in
171 // and the object key
172 fS3Request.SetBucket(rex[3]);
173 fS3Request.SetObjectKey(TString::Format("/%s", (const char*)rex[4]));
174
175 // Initialize super-classes data members (fUrl is a data member of
176 // super-super class TFile)
 // HTTPS is the default transport; schemes ending in 'http' and the
 // legacy 'as3' scheme select plain HTTP.
177 TString protocol = "https";
178 if (rex[1].EndsWith("http", TString::kIgnoreCase) ||
179 rex[1].EqualTo("as3", TString::kIgnoreCase))
180 protocol = "http";
181 fUrl.SetUrl(TString::Format("%s://%s/%s/%s", (const char*)protocol,
182 (const char*)rex[2], (const char*)rex[3], (const char*)rex[4]));
183
184 // Set S3-specific data members. If the access and secret keys are not
185 // provided in the 'options' argument we look in the environmental
186 // variables.
187 const char* kAccessKeyEnv = "S3_ACCESS_KEY";
188 const char* kSecretKeyEnv = "S3_SECRET_KEY";
189 const char* kSessionToken = "S3_SESSION_TOKEN";
 // The environment is consulted only when no access key came from the
 // options; the return value of the lookup is not checked here.
190 if (accessKey.IsNull())
191 GetCredentialsFromEnv(kAccessKeyEnv, kSecretKeyEnv, kSessionToken,
192 accessKey, secretKey, token);
193
194 // Initialize the S3 HTTP request
196 if (accessKey.IsNull() || secretKey.IsNull()) {
197 // We have no authentication information, neither in the options
198 // nor in the environmental variables. This may be a
199 // world-readable file, so continue and see whether
200 // we can open it.
202 } else {
203 // Set the authentication information we need to use
204 // for this file
205 fS3Request.SetAuthKeys(accessKey, secretKey);
 // NOTE(review): the three branches below (session token present,
 // 'gs' scheme, any other scheme) have no visible body in this
 // listing; confirm the guarded calls against the original source.
206 if (!token.IsNull())
208 if (rex[1].BeginsWith("gs"))
210 else
212 }
213
214 // Assume this server does not serve multi-range HTTP GET requests. We
215 // will detect this when the HTTP headers of this file are retrieved
216 // later in the initialization process
218
219 // Call super-class initializer
221
222 // Were there some errors opening this file?
223 if (IsZombie() && (accessKey.IsNull() || secretKey.IsNull())) {
224 // We could not open the file and we have no authentication information
225 // so inform the user so that they can check.
226 Error("TS3WebFile", "could not find authentication info in "\
227 "'options' argument and at least one of the environment variables '%s' or '%s' is not set",
228 kAccessKeyEnv, kSecretKeyEnv);
229 }
230}
231
232
233////////////////////////////////////////////////////////////////////////////////
234/// Extracts the S3 authentication key pair (access key and secret key)
235/// from the options. The authentication credentials can be specified in
236/// the options provided to the constructor of this class as a string
237/// containing: "AUTH=<access key>:<secret key>" and can include other
238/// options, for instance "NOPROXY" for not using the HTTP proxy for
239/// accessing this file's contents.
240/// For instance:
241/// "NOPROXY AUTH=F38XYZABCDeFgHiJkLm:V+frt4re7J1euSNFnmaf8wwmI401234E7kzxZ/TTM+"
242/// A security token may be given by the TOKEN option, in order to allow the
243/// use of a temporary key pair.
244
245Bool_t TS3WebFile::ParseOptions(Option_t* options, TString& accessKey, TString& secretKey, TString& token)
246{
247 TString optStr = (const char*)options;
248 if (optStr.IsNull())
249 return kTRUE;
250
252 if (optStr.Contains("NOPROXY", TString::kIgnoreCase))
253 fNoProxy = kTRUE;
254 CheckProxy();
255
256 // Look in the options string for the authentication information.
257 TPMERegexp rex_token("(^TOKEN=|^.* TOKEN=)([\\S]+)[\\s]*.*$", "i");
258 if (rex_token.Match(optStr) == 3) {
259 token = rex_token[2];
260 }
261 TPMERegexp rex("(^AUTH=|^.* AUTH=)([a-z0-9]+):([a-z0-9+/]+)[\\s]*.*$", "i");
262 if (rex.Match(optStr) == 4) {
263 accessKey = rex[2];
264 secretKey = rex[3];
265 }
266 if (gDebug > 0)
267 Info("ParseOptions", "using authentication information from 'options' argument");
268 return kTRUE;
269}
270
271
272////////////////////////////////////////////////////////////////////////////////
273/// Overrides TWebFile::GetHead() for retrieving the HTTP headers of this
274/// file. Uses TS3HTTPRequest to generate an HTTP HEAD request which includes
275/// the authorization header expected by the S3 server.
276
278{
 // Delegate to the TWebFile implementation and return its result.
 // NOTE(review): no statement preparing the signed HEAD request is
 // visible before this call in this listing; confirm against the
 // original source file.
280 return TWebFile::GetHead();
281}
282
283
284////////////////////////////////////////////////////////////////////////////////
285/// Overrides TWebFile::SetMsgReadBuffer10() for setting the HTTP GET
286/// request compliant to the authentication mechanism used by the S3
287/// protocol. The GET request must contain an "Authorization" header with
288/// the signature of the request, generated using the user's secret access
289/// key.
290
291void TS3WebFile::SetMsgReadBuffer10(const char* redirectLocation, Bool_t tempRedirect)
292{
 // Let the super-class handle the redirection arguments first.
293 TWebFile::SetMsgReadBuffer10(redirectLocation, tempRedirect);
 // NOTE(review): no statement installing the signed GET request is
 // visible at this point in this listing; confirm against the original
 // source file.
295 return;
296}
297
298
299////////////////////////////////////////////////////////////////////////////////
300
302{
303 // Overrides TWebFile::ReadBuffers() for reading specified byte ranges.
304 // According to the kind of server this file is hosted by, we use a
305 // single HTTP request with a multi-range header or we generate multiple
306 // requests with a single range each.
 // Returns kTRUE as soon as one single-range request fails and kFALSE
 // when all 'nbuf' ranges were read.
307
308 // Does this server support multi-range GET requests?
309 if (fUseMultiRange)
310 return TWebFile::ReadBuffers(buf, pos, len, nbuf);
311
312 // Send multiple GET requests with a single range of bytes
313 // Adapted from original version by Wang Lu
 // 'offset' is the write position in 'buf': the ranges are stored back
 // to back, in request order.
314 for (Int_t i=0, offset=0; i < nbuf; i++) {
 // The last byte of the range is inclusive, hence the '- 1'. The request
 // is generated without its trailing CRLF (second argument kFALSE) and
 // the range header supplies the terminating blank line.
315 TString rangeHeader = TString::Format("Range: bytes=%lld-%lld\r\n\r\n",
316 pos[i], pos[i] + len[i] - 1);
317 TString s3Request = fS3Request.GetRequest(TS3HTTPRequest::kGET, kFALSE) + rangeHeader;
318 if (GetFromWeb10(&buf[offset], len[i], s3Request) == -1)
319 return kTRUE;
320 offset += len[i];
321 }
322 return kFALSE;
323}
324
325
326////////////////////////////////////////////////////////////////////////////////
327/// This method is called by the super-class TWebFile when a HTTP header
328/// for this file is retrieved. We scan the 'Server' header to detect the
329/// type of S3 server this file is hosted on and to determine if it is
330/// known to support multi-range HTTP GET requests. Some S3 servers (for
331/// instance Amazon's) do not support that feature and when they
332/// receive a multi-range request they send back the whole file contents.
333/// For this class, if the server does not support multi-range requests
334/// we issue multiple single-range requests instead.
335
337{
 // Only the 'Server' header is of interest; any other header line is
 // ignored.
338 TPMERegexp rex("^Server: (.+)", "i");
339 if (rex.Match(headerLine) != 2)
340 return;
341
342 // Extract the identity of this server and compare it to the
343 // identity of the servers known to support multi-range requests.
344 // The list of server identities is expected to be found in ROOT
345 // configuration.
 // Carriage returns and line feeds are stripped from the captured value.
 // The test is a case-insensitive substring search of the server identity
 // inside the configured 'TS3WebFile.Root.MultiRangeServer' value, which
 // defaults to an empty string.
346 TString serverId = rex[1].ReplaceAll("\r", "").ReplaceAll("\n", "");
347 TString multirangeServers(gEnv->GetValue("TS3WebFile.Root.MultiRangeServer", ""));
348 fUseMultiRange = multirangeServers.Contains(serverId, TString::kIgnoreCase) ? kTRUE : kFALSE;
349}
350
351
352////////////////////////////////////////////////////////////////////////////////
353/// Sets the access and secret keys from the environmental variables, if
354/// they are both set. Sets the security session token if it is given.
355
356Bool_t TS3WebFile::GetCredentialsFromEnv(const char* accessKeyEnv, const char* secretKeyEnv,
357 const char* tokenEnv, TString& outAccessKey,
358 TString& outSecretKey, TString& outToken)
359{
360 // Look first in the recommended environmental variables. Both variables
361 // must be set.
362 TString accKey = gSystem->Getenv(accessKeyEnv);
363 TString secKey = gSystem->Getenv(secretKeyEnv);
364 TString token = gSystem->Getenv(tokenEnv);
365 if (!token.IsNull()) {
366 outToken = token;
367 }
368 if (!accKey.IsNull() && !secKey.IsNull()) {
369 outAccessKey = accKey;
370 outSecretKey = secKey;
371 if (gDebug > 0)
372 Info("GetCredentialsFromEnv", "using authentication information from environmental variables '%s' and '%s'",
373 accessKeyEnv, secretKeyEnv);
374 return kTRUE;
375 }
376
377 // Look now in the legacy environmental variables, for keeping backwards
378 // compatibility.
379 accKey = gSystem->Getenv("S3_ACCESS_ID"); // Legacy access key
380 secKey = gSystem->Getenv("S3_ACCESS_KEY"); // Legacy secret key
381 if (!accKey.IsNull() && !secKey.IsNull()) {
382 Warning("SetAuthKeys", "usage of S3_ACCESS_ID and S3_ACCESS_KEY environmental variables is deprecated.");
383 Warning("SetAuthKeys", "please use S3_ACCESS_KEY and S3_SECRET_KEY environmental variables.");
384 outAccessKey = accKey;
385 outSecretKey = secKey;
386 return kTRUE;
387 }
388
389 return kFALSE;
390}
391
const Bool_t kFALSE
Definition: RtypesCore.h:90
R__EXTERN Int_t gDebug
Definition: RtypesCore.h:117
long long Long64_t
Definition: RtypesCore.h:71
const Bool_t kTRUE
Definition: RtypesCore.h:89
const char Option_t
Definition: RtypesCore.h:64
#define ClassImp(name)
Definition: Rtypes.h:361
#define gDirectory
Definition: TDirectory.h:229
R__EXTERN TEnv * gEnv
Definition: TEnv.h:171
#define gROOT
Definition: TROOT.h:406
R__EXTERN TSystem * gSystem
Definition: TSystem.h:556
virtual Int_t GetValue(const char *name, Int_t dflt) const
Returns the integer value for a resource.
Definition: TEnv.cxx:491
TUrl fUrl
!URL of file
Definition: TFile.h:110
virtual void Warning(const char *method, const char *msgfmt,...) const
Issue warning message.
Definition: TObject.cxx:877
R__ALWAYS_INLINE Bool_t IsZombie() const
Definition: TObject.h:149
virtual void Error(const char *method, const char *msgfmt,...) const
Issue error message.
Definition: TObject.cxx:891
void MakeZombie()
Definition: TObject.h:49
virtual void Info(const char *method, const char *msgfmt,...) const
Issue info message.
Definition: TObject.cxx:865
Wrapper for PCRE library (Perl Compatible Regular Expressions).
Definition: TPRegexp.h:97
Int_t Match(const TString &s, UInt_t start=0)
Runs a match on s against the regex 'this' was created with.
Definition: TPRegexp.cxx:708
TS3HTTPRequest & SetObjectKey(const TString &objectKey)
TString GetRequest(TS3HTTPRequest::EHTTPVerb httpVerb, Bool_t appendCRLF=kTRUE)
Returns the HTTP request ready to be sent to the server.
TS3HTTPRequest & SetAuthKeys(const TString &accessKey, const TString &secretKey)
TS3HTTPRequest & SetBucket(const TString &bucket)
TS3HTTPRequest & SetSessionToken(const TString &token)
TS3HTTPRequest & SetAuthType(TS3HTTPRequest::EAuthType authType)
TS3HTTPRequest & SetHost(const TString &host)
TS3HTTPRequest fS3Request
Definition: TS3WebFile.h:87
Bool_t ParseOptions(Option_t *options, TString &accessKey, TString &secretKey, TString &token)
Extracts the S3 authentication key pair (access key and secret key) from the options.
Definition: TS3WebFile.cxx:245
virtual Int_t GetHead()
Overwrites TWebFile::GetHead() for retrieving the HTTP headers of this file.
Definition: TS3WebFile.cxx:277
virtual void ProcessHttpHeader(const TString &headerLine)
This method is called by the super-class TWebFile when a HTTP header for this file is retrieved.
Definition: TS3WebFile.cxx:336
virtual void SetMsgReadBuffer10(const char *redirectLocation=0, Bool_t tempRedirect=kFALSE)
Overwrites TWebFile::SetMsgReadBuffer10() for setting the HTTP GET request compliant to the authentic...
Definition: TS3WebFile.cxx:291
Bool_t fUseMultiRange
Definition: TS3WebFile.h:88
Bool_t GetCredentialsFromEnv(const char *accessKeyEnv, const char *secretKeyEnv, const char *tokenEnv, TString &outAccessKey, TString &outSecretKey, TString &outToken)
Sets the access and secret keys from the environmental variables, if they are both set.
Definition: TS3WebFile.cxx:356
virtual Bool_t ReadBuffers(char *buf, Long64_t *pos, Int_t *len, Int_t nbuf)
Read specified byte ranges from remote file via HTTP daemon.
Definition: TS3WebFile.cxx:301
Basic string class.
Definition: TString.h:131
@ kIgnoreCase
Definition: TString.h:263
Bool_t IsNull() const
Definition: TString.h:402
static TString Format(const char *fmt,...)
Static method which formats a string using a printf style format descriptor and return a TString.
Definition: TString.cxx:2311
Bool_t Contains(const char *pat, ECaseCompare cmp=kExact) const
Definition: TString.h:619
virtual const char * Getenv(const char *env)
Get environment variable.
Definition: TSystem.cxx:1658
void SetUrl(const char *url, Bool_t defaultIsFile=kFALSE)
Parse url character string and split in its different subcomponents.
Definition: TUrl.cxx:108
const char * GetHost() const
Definition: TUrl.h:69
virtual Int_t GetHead()
Get the HTTP header.
Definition: TWebFile.cxx:1028
virtual void SetMsgReadBuffer10(const char *redirectLocation=0, Bool_t tempRedirect=kFALSE)
Set GET command for use by ReadBuffer(s)10(), handle redirection if needed.
Definition: TWebFile.cxx:266
virtual Bool_t ReadBuffers(char *buf, Long64_t *pos, Int_t *len, Int_t nbuf)
Read specified byte ranges from remote file via HTTP daemon.
Definition: TWebFile.cxx:499
virtual void CheckProxy()
Check if shell var "http_proxy" has been set and should be used.
Definition: TWebFile.cxx:351
TString fMsgGetHead
Definition: TWebFile.h:50
virtual void Init(Bool_t readHeadOnly)
Initialize a TWebFile object.
Definition: TWebFile.cxx:210
TString fMsgReadBuffer10
Definition: TWebFile.h:49
virtual Int_t GetFromWeb10(char *buf, Int_t len, const TString &msg, Int_t nseg=0, Long64_t *seg_pos=0, Int_t *seg_len=0)
Read multiple byte range request from web server.
Definition: TWebFile.cxx:674
Bool_t fNoProxy
Definition: TWebFile.h:47
bool BeginsWith(const std::string &theString, const std::string &theSubstring)
bool EndsWith(const std::string &theString, const std::string &theSubstring)