TRegexp - source file

// @(#)root/base:$Id$
// Author: Fons Rademakers   04/08/95

/*************************************************************************
 * Copyright (C) 1995-2000, Rene Brun and Fons Rademakers.               *
 * All rights reserved.                                                  *
 *                                                                       *
 * For the licensing terms see $ROOTSYS/LICENSE.                         *
 * For the list of contributors see $ROOTSYS/README/CREDITS.             *
 *************************************************************************/

//////////////////////////////////////////////////////////////////////////
//                                                                      //
// TRegexp                                                              //
//                                                                      //
// Regular expression class.                                            //
//                                                                      //
//   '^'             // start-of-line anchor                            //
//   '$'             // end-of-line anchor                              //
//   '.'             // matches any character                           //
//   '['             // start a character class                         //
//   ']'             // end a character class                           //
//   '^'             // negates character class if 1st character        //
//   '*'             // Kleene closure (matches 0 or more)              //
//   '+'             // Positive closure (1 or more)                    //
//   '?'             // Optional closure (0 or 1)                       //
//                                                                      //
//   Note that the '|' operator (union) is not supported, nor are       //
//   parentheses (grouping). Therefore "a|b" does not match "a".        //
//                                                                      //
//   Standard classes like [:alnum:], [:alpha:], etc. are not supported,//
//   only [a-zA-Z], [^ntf] and so on.                                   //
//                                                                      //
//////////////////////////////////////////////////////////////////////////

#include "TRegexp.h"
#include "TString.h"
#include "TError.h"
#include "ThreadLocalStorage.h"

// Number of Pattern_t elements in every compiled pattern buffer allocated by
// GenPattern() and CopyPattern(); MakeWildcard() also uses it as the size of
// the character buffer that receives the translated wildcard expression.
const unsigned TRegexp::fgMaxpat = 2048;


ClassImp(TRegexp)

//______________________________________________________________________________
TRegexp::TRegexp(const char *re, Bool_t wildcard)
{
   // Build a regular expression from the string re. When wildcard is set,
   // re is first translated by MakeWildcard() and the translated text is
   // what gets compiled; otherwise re is compiled as given.

   const char *expression = wildcard ? MakeWildcard(re) : re;
   GenPattern(expression);
}

//______________________________________________________________________________
TRegexp::TRegexp(const TString& re)
{
   // Build a regular expression from the character data held by a TString.

   const char *expression = re.Data();
   GenPattern(expression);
}

//______________________________________________________________________________
TRegexp::TRegexp(const TRegexp& r)
{
   // Copy constructor: takes a private copy of r's compiled pattern and
   // its status (see CopyPattern()).

   this->CopyPattern(r);
}

//______________________________________________________________________________
TRegexp::~TRegexp()
{
   // Destructor: releases the compiled pattern buffer.

   delete[] this->fPattern;
}

//______________________________________________________________________________
TRegexp& TRegexp::operator=(const TRegexp& r)
{
   // Copy assignment. Self-assignment is a no-op; otherwise the current
   // pattern buffer is released and replaced by a copy of r's.

   if (this == &r)
      return *this;

   delete [] fPattern;
   CopyPattern(r);
   return *this;
}

//______________________________________________________________________________
TRegexp& TRegexp::operator=(const char *str)
{
   // Assign from a C string: the old pattern buffer is released and str is
   // compiled into a freshly allocated one.

   delete[] this->fPattern;
   this->GenPattern(str);
   return *this;
}

//______________________________________________________________________________
TRegexp& TRegexp::operator=(const TString &str)
{
   // Assign from a TString: the old pattern buffer is released and the
   // string's character data is compiled into a freshly allocated one.

   delete[] this->fPattern;
   const char *expression = str.Data();
   this->GenPattern(expression);
   return *this;
}

//______________________________________________________________________________
void TRegexp::GenPattern(const char *str)
{
   // Generate the regular expression pattern.

   fPattern = new Pattern_t[fgMaxpat];
   int error = ::Makepat(str, fPattern, fgMaxpat);
   fStat = (error < 3) ? (EStatVal) error : kToolong;
}

//______________________________________________________________________________
void TRegexp::CopyPattern(const TRegexp& r)
{
   // Allocate a new pattern buffer and fill it with a byte-wise copy of r's
   // compiled pattern, then take over r's status. Any buffer previously
   // held in fPattern is not released here.

   const size_t nbytes = fgMaxpat * sizeof(Pattern_t);

   fPattern = new Pattern_t[fgMaxpat];
   memcpy(fPattern, r.fPattern, nbytes);
   fStat = r.fStat;
}

//______________________________________________________________________________
const char *TRegexp::MakeWildcard(const char *re)
{
   // Translate a wildcard expression into a general regular expression.
   //
   //  - A "^" is prepended unless the input already starts with one, and a
   //    "$" is appended unless the input already ends with one, so the
   //    result is anchored at both ends.
   //  - "*" becomes "any character except a path separator" followed by
   //    "*", and "?" becomes exactly one such character. This makes
   //    "*.root" match "aap.root" but not "pipo/aap.root".
   //  - "." is escaped with a backslash (so *.ps is different from *.eps).
   //  - Every other character is copied unchanged.
   //
   // A null or empty input yields "". The result lives in a per-thread
   // buffer of fgMaxpat characters that is overwritten by the next call on
   // the same thread. If the translation gets close to the buffer size, an
   // error is reported and the remaining input is dropped.

#ifdef R__HAS_THREAD_LOCAL
   thread_local char buf[fgMaxpat];
#else
   typedef char buf_t[fgMaxpat];
   buf_t &buf( TTHREAD_TLS_INIT_ARRAY<4 /* must be unique */, buf_t, char>() );
#endif

   // Character class substituted for each wildcard character.
#ifndef R__WIN32
   const char *anyChar = "[^/]";
#else
   const char *anyChar = "[^\\/:]";
#endif

   if (!re) return "";
   const int len = strlen(re);
   if (!len) return "";

   const size_t anyLen = strlen(anyChar);
   char *out = buf;

   for (int i = 0; i < len; ++i) {
      // One input character expands to at most 9 output characters, so a
      // margin of 10 also leaves room for the terminating '\0'.
      if ((unsigned)(out - buf) > fgMaxpat - 10) {
         Error("MakeWildcard", "regexp too large");
         break;
      }

      const char c = re[i];

      if (i == 0 && c != '^')
         *out++ = '^';

      switch (c) {
         case '*':
            strcpy(out, anyChar);
            out += anyLen;
            *out++ = c;
            break;
         case '.':
            *out++ = '\\';
            *out++ = c;
            break;
         case '?':
            strcpy(out, anyChar);
            out += anyLen;
            break;
         default:
            *out++ = c;
            break;
      }

      if (i == len-1 && c != '$')
         *out++ = '$';
   }

   *out = '\0';
   return buf;
}

//______________________________________________________________________________
Ssiz_t TRegexp::Index(const TString& string, Ssiz_t* len, Ssiz_t i) const
{
   // Find the first occurrence of the regexp in string and return its
   // position, or kNPOS if there is no match. On return *len holds the
   // length of the matched text (0 when nothing matched) and i is the
   // offset at which matching starts.
   //
   // If the expression did not compile (fStat != kOK) an error is reported,
   // but matching is still attempted. An offset outside [0, length] is
   // treated as "no match".

   if (fStat != kOK)
      Error("TRegexp::Index", "Bad Regular Expression");

   const char* startp;
   const char* s = string.Data();
   Ssiz_t slen = string.Length();

   // Reject offsets outside the string. *len is always written, so callers
   // never read an indeterminate extent, and a negative offset can no longer
   // make the matcher start before the beginning of the buffer.
   if (i < 0 || slen < i) {
      *len = 0;
      return kNPOS;
   }

   const char* endp = ::Matchs(s+i, slen-i, fPattern, &startp);
   if (endp) {
      *len = endp - startp;
      return startp - s;
   } else {
      *len = 0;
      return kNPOS;
   }
}

//______________________________________________________________________________
TRegexp::EStatVal TRegexp::Status()
{
   // Return the current status of the regexp and reset the stored status
   // to kOK.

   const EStatVal previous = fStat;
   fStat = kOK;
   return previous;
}

//////////////////////////////////////////////////////////////////////////
//                                                                      //
// TString member functions, put here so the linker will include        //
// them only if regular expressions are used.                           //
//                                                                      //
//////////////////////////////////////////////////////////////////////////

//______________________________________________________________________________
Ssiz_t TString::Index(const TRegexp& r, Ssiz_t start) const
{
   // Find the first occurrence of the regexp in this string and return its
   // position, or -1 if there is no match. Start is the offset at which
   // the search begins. The length of the match is discarded.

   Ssiz_t ignoredExtent = 0;
   const Ssiz_t pos = r.Index(*this, &ignoredExtent, start);
   return pos;
}

//______________________________________________________________________________
Ssiz_t TString::Index(const TRegexp& r, Ssiz_t* extent, Ssiz_t start) const
{
   // Find the first occurrence of the regexp in this string and return its
   // position, or -1 if there is no match. Extent receives the length of
   // the matched text and start is the offset at which matching begins.
   // The work is done by TRegexp::Index().

   const Ssiz_t pos = r.Index(*this, extent, start);
   return pos;
}

//______________________________________________________________________________
TSubString TString::operator()(const TRegexp& r, Ssiz_t start) const
{
   // Return the substring found by applying the regexp, with the search
   // beginning at offset start. The substring is built from the position
   // and extent reported by Index().

   // Start from a zero extent: TRegexp::Index() can return kNPOS without
   // writing the length (start beyond the end of the string), and the
   // substring must not be built from an indeterminate value.
   Ssiz_t len = 0;
   Ssiz_t begin = Index(r, &len, start);
   return TSubString(*this, begin, len);
}

//______________________________________________________________________________
TSubString TString::operator()(const TRegexp& r) const
{
   // Return the substring found by applying the regexp, searching from the
   // beginning of the string.

   const Ssiz_t fromStart = 0;
   return (*this)(r, fromStart);
}

//__________________________________________________________________________________
Bool_t TString::Tokenize(TString &tok, Ssiz_t &from, const char *delim) const
{
   // Extract the next token of this string, where tokens are separated by
   // matches of the regular expression 'delim' (default " "). The search
   // starts at 'from'; the token is returned in 'tok' and 'from' is updated
   // to the position just after the delimiter that ended the token.
   // Returns kTRUE if a token was found, kFALSE if not or if the inputs are
   // inconsistent (empty string, 'from' outside the string).
   // A delimiter located exactly at 'from' yields an empty token and kTRUE.
   // This makes it possible to loop over tokens like this:
   //
   //    TString myl = "tok1 tok2|tok3";
   //    TString tok;
   //    Ssiz_t from = 0;
   //    while (myl.Tokenize(tok, from, "[ |]")) {
   //       // Analyse tok
   //       ...
   //    }
   //
   // which is more convenient than the other Tokenize method when the
   // tokens do not need to be stored.

   // Always hand back an empty token on failure
   tok = "";

   // Nothing to do for an empty string or a start position outside of it
   const Int_t length = Length();
   if (length <= 0 || from < 0 || from >= length)
      return kFALSE;

   // Locate the next delimiter at or after 'from'
   TRegexp delimiter(delim);
   Int_t matchLen = 0;
   const Int_t matchPos = Index(delimiter, &matchLen, from);

   Bool_t gotToken = kTRUE;
   if (matchPos == kNPOS) {
      // No further delimiter: the token is the remainder of the string, and
      // 'from' takes the "not found" position so that the next call stops.
      tok = (*this)(from, length - from + 1);
      from = matchPos;
      if (tok.IsNull())
         gotToken = kFALSE;
   } else {
      // Token is the text between 'from' and the delimiter (possibly empty);
      // the next search resumes right after the delimiter.
      if (matchPos > from)
         tok = (*this)(from, matchPos - from);
      from = matchPos + matchLen;
   }

   // Never let 'from' point past the end of the string
   if (from >= length)
      from = length;

   return gotToken;
}

TRegexp.cxx:1

TRegexp.cxx:2

TRegexp.cxx:3

TRegexp.cxx:4

TRegexp.cxx:5

TRegexp.cxx:6

TRegexp.cxx:7

TRegexp.cxx:8

TRegexp.cxx:9

TRegexp.cxx:10

TRegexp.cxx:11

TRegexp.cxx:12

TRegexp.cxx:13

TRegexp.cxx:14

TRegexp.cxx:15

TRegexp.cxx:16

TRegexp.cxx:17

TRegexp.cxx:18

TRegexp.cxx:19

TRegexp.cxx:20

TRegexp.cxx:21

TRegexp.cxx:22

TRegexp.cxx:23

TRegexp.cxx:24

TRegexp.cxx:25

TRegexp.cxx:26

TRegexp.cxx:27

TRegexp.cxx:28

TRegexp.cxx:29

TRegexp.cxx:30

TRegexp.cxx:31

TRegexp.cxx:32

TRegexp.cxx:33

TRegexp.cxx:34

TRegexp.cxx:35

TRegexp.cxx:36

TRegexp.cxx:37

TRegexp.cxx:38

TRegexp.cxx:39

TRegexp.cxx:40

TRegexp.cxx:41

TRegexp.cxx:42

TRegexp.cxx:43

TRegexp.cxx:44

TRegexp.cxx:45

TRegexp.cxx:46

TRegexp.cxx:47

TRegexp.cxx:48

TRegexp.cxx:49

TRegexp.cxx:50

TRegexp.cxx:51

TRegexp.cxx:52

TRegexp.cxx:53

TRegexp.cxx:54

TRegexp.cxx:55

TRegexp.cxx:56

TRegexp.cxx:57

TRegexp.cxx:58

TRegexp.cxx:59

TRegexp.cxx:60

TRegexp.cxx:61

TRegexp.cxx:62

TRegexp.cxx:63

TRegexp.cxx:64

TRegexp.cxx:65

TRegexp.cxx:66

TRegexp.cxx:67

TRegexp.cxx:68

TRegexp.cxx:69

TRegexp.cxx:70

TRegexp.cxx:71

TRegexp.cxx:72

TRegexp.cxx:73

TRegexp.cxx:74

TRegexp.cxx:75

TRegexp.cxx:76

TRegexp.cxx:77

TRegexp.cxx:78

TRegexp.cxx:79

TRegexp.cxx:80

TRegexp.cxx:81

TRegexp.cxx:82

TRegexp.cxx:83

TRegexp.cxx:84

TRegexp.cxx:85

TRegexp.cxx:86

TRegexp.cxx:87

TRegexp.cxx:88

TRegexp.cxx:89

TRegexp.cxx:90

TRegexp.cxx:91

TRegexp.cxx:92

TRegexp.cxx:93

TRegexp.cxx:94

TRegexp.cxx:95

TRegexp.cxx:96

TRegexp.cxx:97

TRegexp.cxx:98

TRegexp.cxx:99

TRegexp.cxx:100

TRegexp.cxx:101

TRegexp.cxx:102

TRegexp.cxx:103

TRegexp.cxx:104

TRegexp.cxx:105

TRegexp.cxx:106

TRegexp.cxx:107

TRegexp.cxx:108

TRegexp.cxx:109

TRegexp.cxx:110

TRegexp.cxx:111

TRegexp.cxx:112

TRegexp.cxx:113

TRegexp.cxx:114

TRegexp.cxx:115

TRegexp.cxx:116

TRegexp.cxx:117

TRegexp.cxx:118

TRegexp.cxx:119

TRegexp.cxx:120

TRegexp.cxx:121

TRegexp.cxx:122

TRegexp.cxx:123

TRegexp.cxx:124

TRegexp.cxx:125

TRegexp.cxx:126

TRegexp.cxx:127

TRegexp.cxx:128

TRegexp.cxx:129

TRegexp.cxx:130

TRegexp.cxx:131

TRegexp.cxx:132

TRegexp.cxx:133

TRegexp.cxx:134

TRegexp.cxx:135

TRegexp.cxx:136

TRegexp.cxx:137

TRegexp.cxx:138

TRegexp.cxx:139

TRegexp.cxx:140

TRegexp.cxx:141

TRegexp.cxx:142

TRegexp.cxx:143

TRegexp.cxx:144

TRegexp.cxx:145

TRegexp.cxx:146

TRegexp.cxx:147

TRegexp.cxx:148

TRegexp.cxx:149

TRegexp.cxx:150

TRegexp.cxx:151

TRegexp.cxx:152

TRegexp.cxx:153

TRegexp.cxx:154

TRegexp.cxx:155

TRegexp.cxx:156

TRegexp.cxx:157

TRegexp.cxx:158

TRegexp.cxx:159

TRegexp.cxx:160

TRegexp.cxx:161

TRegexp.cxx:162

TRegexp.cxx:163

TRegexp.cxx:164

TRegexp.cxx:165

TRegexp.cxx:166

TRegexp.cxx:167

TRegexp.cxx:168

TRegexp.cxx:169

TRegexp.cxx:170

TRegexp.cxx:171

TRegexp.cxx:172

TRegexp.cxx:173

TRegexp.cxx:174

TRegexp.cxx:175

TRegexp.cxx:176

TRegexp.cxx:177

TRegexp.cxx:178

TRegexp.cxx:179

TRegexp.cxx:180

TRegexp.cxx:181

TRegexp.cxx:182

TRegexp.cxx:183

TRegexp.cxx:184

TRegexp.cxx:185

TRegexp.cxx:186

TRegexp.cxx:187

TRegexp.cxx:188

TRegexp.cxx:189

TRegexp.cxx:190

TRegexp.cxx:191

TRegexp.cxx:192

TRegexp.cxx:193

TRegexp.cxx:194

TRegexp.cxx:195

TRegexp.cxx:196

TRegexp.cxx:197

TRegexp.cxx:198

TRegexp.cxx:199

TRegexp.cxx:200

TRegexp.cxx:201

TRegexp.cxx:202

TRegexp.cxx:203

TRegexp.cxx:204

TRegexp.cxx:205

TRegexp.cxx:206

TRegexp.cxx:207

TRegexp.cxx:208

TRegexp.cxx:209

TRegexp.cxx:210

TRegexp.cxx:211

TRegexp.cxx:212

TRegexp.cxx:213

TRegexp.cxx:214

TRegexp.cxx:215

TRegexp.cxx:216

TRegexp.cxx:217

TRegexp.cxx:218

TRegexp.cxx:219

TRegexp.cxx:220

TRegexp.cxx:221

TRegexp.cxx:222

TRegexp.cxx:223

TRegexp.cxx:224

TRegexp.cxx:225

TRegexp.cxx:226

TRegexp.cxx:227

TRegexp.cxx:228

TRegexp.cxx:229

TRegexp.cxx:230

TRegexp.cxx:231

TRegexp.cxx:232

TRegexp.cxx:233

TRegexp.cxx:234

TRegexp.cxx:235

TRegexp.cxx:236

TRegexp.cxx:237

TRegexp.cxx:238

TRegexp.cxx:239

TRegexp.cxx:240

TRegexp.cxx:241

TRegexp.cxx:242

TRegexp.cxx:243

TRegexp.cxx:244

TRegexp.cxx:245

TRegexp.cxx:246

TRegexp.cxx:247

TRegexp.cxx:248

TRegexp.cxx:249

TRegexp.cxx:250

TRegexp.cxx:251

TRegexp.cxx:252

TRegexp.cxx:253

TRegexp.cxx:254

TRegexp.cxx:255

TRegexp.cxx:256

TRegexp.cxx:257

TRegexp.cxx:258

TRegexp.cxx:259

TRegexp.cxx:260

TRegexp.cxx:261

TRegexp.cxx:262

TRegexp.cxx:263

TRegexp.cxx:264

TRegexp.cxx:265

TRegexp.cxx:266

TRegexp.cxx:267

TRegexp.cxx:268

TRegexp.cxx:269

TRegexp.cxx:270

TRegexp.cxx:271

TRegexp.cxx:272

TRegexp.cxx:273

TRegexp.cxx:274

TRegexp.cxx:275

TRegexp.cxx:276

TRegexp.cxx:277

TRegexp.cxx:278

TRegexp.cxx:279

TRegexp.cxx:280

TRegexp.cxx:281

TRegexp.cxx:282

TRegexp.cxx:283

TRegexp.cxx:284

TRegexp.cxx:285

TRegexp.cxx:286

TRegexp.cxx:287

TRegexp.cxx:288

TRegexp.cxx:289

TRegexp.cxx:290

TRegexp.cxx:291

TRegexp.cxx:292

TRegexp.cxx:293

TRegexp.cxx:294

TRegexp.cxx:295

TRegexp.cxx:296

TRegexp.cxx:297

TRegexp.cxx:298

TRegexp.cxx:299

TRegexp.cxx:300

TRegexp.cxx:301

TRegexp.cxx:302

TRegexp.cxx:303

TRegexp.cxx:304

TRegexp.cxx:305

TRegexp.cxx:306

TRegexp.cxx:307

TRegexp.cxx:308

TRegexp.cxx:309

TRegexp.cxx:310

TRegexp.cxx:311

TRegexp.cxx:312

TRegexp.cxx:313

TRegexp.cxx:314

TRegexp.cxx:315

TRegexp.cxx:316

TRegexp.cxx:317

TRegexp.cxx:318

TRegexp.cxx:319

TRegexp.cxx:320

TRegexp.cxx:321

TRegexp.cxx:322

TRegexp.cxx:323

TRegexp.cxx:324

TRegexp.cxx:325

TRegexp.cxx:326

TRegexp.cxx:327

TRegexp.cxx:328

TRegexp.cxx:329

TRegexp.cxx:330

TRegexp.cxx:331

TRegexp.cxx:332

TRegexp.cxx:333

TRegexp.cxx:334

TRegexp.cxx:335

TRegexp.cxx:336

TRegexp.cxx:337

TRegexp.cxx:338

TRegexp.cxx:339

TRegexp.cxx:340

TRegexp.cxx:341

TRegexp.cxx:342

TRegexp.cxx:343

TRegexp.cxx:344

TRegexp.cxx:345

TRegexp.cxx:346

TRegexp.cxx:347

TRegexp.cxx:348