ROOT  6.07/01
Reference Guide
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Groups Pages
TPRegexp.cxx
Go to the documentation of this file.
1 // @(#)root/base:$Id$
2 // Author: Eddy Offermann 24/06/05
3 
4 /*************************************************************************
5  * Copyright (C) 1995-2005, Rene Brun and Fons Rademakers. *
6  * All rights reserved. *
7  * *
8  * For the licensing terms see $ROOTSYS/LICENSE. *
9  * For the list of contributors see $ROOTSYS/README/CREDITS. *
10  *************************************************************************/
11 
12 /** \class TPRegexp
13 
14 C++ Wrapper for the "Perl Compatible Regular Expressions" library
15  The PCRE lib can be found at: http://www.pcre.org/
16 
17 Extensive documentation about Regular expressions in Perl can be
18 found at : http://perldoc.perl.org/perlre.html
19 */
20 
21 #include "Riostream.h"
22 #include "TPRegexp.h"
23 #include "TObjArray.h"
24 #include "TObjString.h"
25 #include "TError.h"
26 
27 #include <pcre.h>
28 
29 #include <vector>
30 #include <stdexcept>
31 
32 struct PCREPriv_t {
33  pcre *fPCRE;
34  pcre_extra *fPCREExtra;
35 
36  PCREPriv_t() { fPCRE = 0; fPCREExtra = 0; }
37 };
38 
39 
41 
42 Bool_t TPRegexp::fgThrowAtCompileError = kFALSE;
43 
44 ////////////////////////////////////////////////////////////////////////////////
45 /// Default ctor.
46 
48 {
49  fPriv = new PCREPriv_t;
50  fPCREOpts = 0;
51 }
52 
53 ////////////////////////////////////////////////////////////////////////////////
54 /// Create and initialize with pat.
55 
57 {
58  fPattern = pat;
59  fPriv = new PCREPriv_t;
60  fPCREOpts = 0;
61 }
62 
63 ////////////////////////////////////////////////////////////////////////////////
64 /// Copy ctor.
65 
67 {
68  fPattern = p.fPattern;
69  fPriv = new PCREPriv_t;
70  fPCREOpts = p.fPCREOpts;
71 }
72 
73 ////////////////////////////////////////////////////////////////////////////////
74 /// Cleanup.
75 
77 {
78  if (fPriv->fPCRE)
79  pcre_free(fPriv->fPCRE);
80  if (fPriv->fPCREExtra)
81  pcre_free(fPriv->fPCREExtra);
82  delete fPriv;
83 }
84 
85 ////////////////////////////////////////////////////////////////////////////////
86 /// Assignment operator.
87 
89 {
90  if (this != &p) {
91  fPattern = p.fPattern;
92  if (fPriv->fPCRE)
93  pcre_free(fPriv->fPCRE);
94  fPriv->fPCRE = 0;
95  if (fPriv->fPCREExtra)
96  pcre_free(fPriv->fPCREExtra);
97  fPriv->fPCREExtra = 0;
98  fPCREOpts = p.fPCREOpts;
99  }
100  return *this;
101 }
102 
103 ////////////////////////////////////////////////////////////////////////////////
104 /// Translate Perl modifier flags into pcre flags.
105 /// The supported modStr characters are: g, i, m, o, s, x, and the
106 /// special d for debug. The meaning of the letters is:
107 /// - m
108 /// Treat string as multiple lines. That is, change "^" and "$" from
109 /// matching the start or end of the string to matching the start or
110 /// end of any line anywhere within the string.
111 /// - s
112 /// Treat string as single line. That is, change "." to match any
113 /// character whatsoever, even a newline, which normally it would not match.
114 /// Used together, as /ms, they let the "." match any character whatsoever,
115 /// while still allowing "^" and "$" to match, respectively, just after and
116 /// just before newlines within the string.
117 /// - i
118 /// Do case-insensitive pattern matching.
119 /// - x
120 /// Extend your pattern's legibility by permitting whitespace and comments.
121 /// - p
122 /// Preserve the string matched such that ${^PREMATCH}, ${^MATCH},
123 /// and ${^POSTMATCH} are available for use after matching.
124 /// - g and c
125 /// Global matching, and keep the Current position after failed matching.
126 /// Unlike i, m, s and x, these two flags affect the way the regex is used
127 /// rather than the regex itself. See Using regular expressions in Perl in
128 /// perlretut for further explanation of the g and c modifiers.
129 /// For more detail see: http://perldoc.perl.org/perlre.html#Modifiers.
130 
131 UInt_t TPRegexp::ParseMods(const TString &modStr) const
132 {
133  UInt_t opts = 0;
134 
135  if (modStr.Length() <= 0)
136  return fPCREOpts;
137 
138  //translate perl flags into pcre flags
139  const char *m = modStr;
140  while (*m) {
141  switch (*m) {
142  case 'g':
143  opts |= kPCRE_GLOBAL;
144  break;
145  case 'i':
146  opts |= PCRE_CASELESS;
147  break;
148  case 'm':
149  opts |= PCRE_MULTILINE;
150  break;
151  case 'o':
152  opts |= kPCRE_OPTIMIZE;
153  break;
154  case 's':
155  opts |= PCRE_DOTALL;
156  break;
157  case 'x':
158  opts |= PCRE_EXTENDED;
159  break;
160  case 'd': // special flag to enable debug printing (not Perl compat.)
161  opts |= kPCRE_DEBUG_MSGS;
162  break;
163  default:
164  Error("ParseMods", "illegal pattern modifier: %c", *m);
165  opts = 0;
166  }
167  ++m;
168  }
169  return opts;
170 }
171 
172 ////////////////////////////////////////////////////////////////////////////////
173 /// Return PCRE modifier options as string.
174 /// For meaning of mods see ParseMods().
175 
177 {
178  TString ret;
179 
180  if (fPCREOpts & kPCRE_GLOBAL) ret += 'g';
181  if (fPCREOpts & PCRE_CASELESS) ret += 'i';
182  if (fPCREOpts & PCRE_MULTILINE) ret += 'm';
183  if (fPCREOpts & PCRE_DOTALL) ret += 's';
184  if (fPCREOpts & PCRE_EXTENDED) ret += 'x';
185  if (fPCREOpts & kPCRE_OPTIMIZE) ret += 'o';
186  if (fPCREOpts & kPCRE_DEBUG_MSGS) ret += 'd';
187 
188  return ret;
189 }
190 
191 ////////////////////////////////////////////////////////////////////////////////
192 /// Compile the fPattern.
193 
195 {
196  if (fPriv->fPCRE)
197  pcre_free(fPriv->fPCRE);
198 
200  Info("Compile", "PREGEX compiling %s", fPattern.Data());
201 
202  const char *errstr;
203  Int_t patIndex;
204  fPriv->fPCRE = pcre_compile(fPattern.Data(), fPCREOpts & kPCRE_INTMASK,
205  &errstr, &patIndex, 0);
206 
207  if (!fPriv->fPCRE) {
208  if (fgThrowAtCompileError) {
209  throw std::runtime_error
210  (TString::Format("TPRegexp::Compile() compilation of TPRegexp(%s) failed at: %d because %s",
211  fPattern.Data(), patIndex, errstr).Data());
212  } else {
213  Error("Compile", "compilation of TPRegexp(%s) failed at: %d because %s",
214  fPattern.Data(), patIndex, errstr);
215  return;
216  }
217  }
218 
219  if (fPriv->fPCREExtra || (fPCREOpts & kPCRE_OPTIMIZE))
220  Optimize();
221 }
222 
223 ////////////////////////////////////////////////////////////////////////////////
224 /// Send the pattern through the optimizer.
225 
227 {
228  if (fPriv->fPCREExtra)
229  pcre_free(fPriv->fPCREExtra);
230 
232  Info("Optimize", "PREGEX studying %s", fPattern.Data());
233 
234  const char *errstr;
235  // pcre_study allows less options - see pcre_internal.h PUBLIC_STUDY_OPTIONS.
236  fPriv->fPCREExtra = pcre_study(fPriv->fPCRE, 0, &errstr);
237 
238  if (!fPriv->fPCREExtra && errstr) {
239  Error("Optimize", "Optimization of TPRegexp(%s) failed: %s",
240  fPattern.Data(), errstr);
241  }
242 }
243 
244 ////////////////////////////////////////////////////////////////////////////////
245 /// Returns the number of expanded '$' constructs.
246 
248  const TString &replacePattern,
249  Int_t *offVec, Int_t nrMatch) const
250 {
251  Int_t nrSubs = 0;
252  const char *p = replacePattern;
253 
254  Int_t state = 0;
255  Int_t subnum = 0;
256  while (state != -1) {
257  switch (state) {
258  case 0:
259  if (!*p) {
260  state = -1;
261  break;
262  }
263  if (*p == '$') {
264  state = 1;
265  subnum = 0;
266  if (p[1] == '&') {
267  p++;
268  if (isdigit(p[1]))
269  p++;
270  } else if (!isdigit(p[1])) {
271  Error("ReplaceSubs", "badly formed replacement pattern: %s",
272  replacePattern.Data());
273  }
274  } else
275  final += *p;
276  break;
277  case 1:
278  if (isdigit(*p)) {
279  subnum *= 10;
280  subnum += (*p)-'0';
281  } else {
283  Info("ReplaceSubs", "PREGEX appending substr #%d", subnum);
284  if (subnum < 0 || subnum > nrMatch-1) {
285  Error("ReplaceSubs","bad string number: %d",subnum);
286  } else {
287  const TString subStr = s(offVec[2*subnum],offVec[2*subnum+1]-offVec[2*subnum]);
288  final += subStr;
289  nrSubs++;
290  }
291  state = 0;
292  continue; // send char to start state
293  }
294  }
295  p++;
296  }
297  return nrSubs;
298 }
299 
300 ////////////////////////////////////////////////////////////////////////////////
301 /// Perform the actual matching - protected method.
302 
304  Int_t nMaxMatch, TArrayI *pos) const
305 {
306  Int_t *offVec = new Int_t[3*nMaxMatch];
307  // pcre_exec allows less options - see pcre_internal.h PUBLIC_EXEC_OPTIONS.
308  Int_t nrMatch = pcre_exec(fPriv->fPCRE, fPriv->fPCREExtra, s.Data(),
309  s.Length(), start, 0,
310  offVec, 3*nMaxMatch);
311 
312  if (nrMatch == PCRE_ERROR_NOMATCH)
313  nrMatch = 0;
314  else if (nrMatch <= 0) {
315  Error("Match","pcre_exec error = %d", nrMatch);
316  delete [] offVec;
317  return 0;
318  }
319 
320  if (pos)
321  pos->Set(2*nrMatch, offVec);
322  delete [] offVec;
323 
324  return nrMatch;
325 }
326 
327 ////////////////////////////////////////////////////////////////////////////////
328 /// The number of matches is returned, this equals the full match +
329 /// sub-pattern matches.
330 /// nMaxMatch is the maximum allowed number of matches.
331 /// pos contains the string indices of the matches. Its usage is
332 /// shown in the routine MatchS.
333 /// For meaning of mods see ParseMods().
334 
336  Int_t nMaxMatch, TArrayI *pos)
337 {
338  UInt_t opts = ParseMods(mods);
339 
340  if (!fPriv->fPCRE || opts != fPCREOpts) {
341  fPCREOpts = opts;
342  Compile();
343  }
344 
345  return MatchInternal(s, start, nMaxMatch, pos);
346 }
347 
348 
349 ////////////////////////////////////////////////////////////////////////////////
350 /// Returns a TObjArray of matched substrings as TObjString's.
351 /// The TObjArray is owner of the objects and must be deleted by the user.
352 /// The first entry is the full matched pattern, followed by the sub-patterns.
353 /// If a pattern was not matched, it will return an empty substring:
354 /// ~~~ {.cpp}
355 /// TObjArray *subStrL = TPRegexp("(a|(z))(bc)").MatchS("abc");
356 /// for (Int_t i = 0; i < subStrL->GetLast()+1; i++) {
357 /// const TString subStr = ((TObjString *)subStrL->At(i))->GetString();
358 /// std::cout << "\"" << subStr << "\" ";
359 /// }
360 /// std::cout << subStr << std::endl;
361 /// ~~~
362 /// produces: "abc" "a" "" "bc"
363 ///
364 /// For meaning of mods see ParseMods().
365 
366 TObjArray *TPRegexp::MatchS(const TString &s, const TString &mods,
367  Int_t start, Int_t nMaxMatch)
368 {
369  TArrayI pos;
370  Int_t nrMatch = Match(s, mods, start, nMaxMatch, &pos);
371 
372  TObjArray *subStrL = new TObjArray();
373  subStrL->SetOwner();
374 
375  for (Int_t i = 0; i < nrMatch; i++) {
376  Int_t startp = pos[2*i];
377  Int_t stopp = pos[2*i+1];
378  if (startp >= 0 && stopp >= 0) {
379  const TString subStr = s(pos[2*i], pos[2*i+1]-pos[2*i]);
380  subStrL->Add(new TObjString(subStr));
381  } else
382  subStrL->Add(new TObjString());
383  }
384 
385  return subStrL;
386 }
387 
388 ////////////////////////////////////////////////////////////////////////////////
389 /// Perform pattern substitution with optional back-ref replacement
390 /// - protected method.
391 
393  Int_t start, Int_t nMaxMatch,
394  Bool_t doDollarSubst) const
395 {
396  Int_t *offVec = new Int_t[3*nMaxMatch];
397 
398  TString final;
399  Int_t nrSubs = 0;
400  Int_t offset = start;
401  Int_t last = 0;
402 
403  while (kTRUE) {
404 
405  // find next matching subs
406  // pcre_exec allows less options - see pcre_internal.h PUBLIC_EXEC_OPTIONS.
407  Int_t nrMatch = pcre_exec(fPriv->fPCRE, fPriv->fPCREExtra, s.Data(),
408  s.Length(), offset, 0,
409  offVec, 3*nMaxMatch);
410 
411  if (nrMatch == PCRE_ERROR_NOMATCH) {
412  nrMatch = 0;
413  break;
414  } else if (nrMatch <= 0) {
415  Error("Substitute", "pcre_exec error = %d", nrMatch);
416  break;
417  }
418 
419  // append anything previously unmatched, but not substituted
420  if (last <= offVec[0]) {
421  final += s(last,offVec[0]-last);
422  last = offVec[1];
423  }
424 
425  // replace stuff in s
426  if (doDollarSubst) {
427  ReplaceSubs(s, final, replacePattern, offVec, nrMatch);
428  } else {
429  final += replacePattern;
430  }
431  ++nrSubs;
432 
433  // if global gotta check match at every pos
434  if (!(fPCREOpts & kPCRE_GLOBAL))
435  break;
436 
437  if (offVec[0] != offVec[1])
438  offset = offVec[1];
439  else {
440  // matched empty string
441  if (offVec[1] == s.Length())
442  break;
443  offset = offVec[1]+1;
444  }
445  }
446 
447  delete [] offVec;
448 
449  final += s(last,s.Length()-last);
450  s = final;
451 
452  return nrSubs;
453 }
454 
455 ////////////////////////////////////////////////////////////////////////////////
456 /// Substitute replaces the string s by a new string in which matching
457 /// patterns are replaced by the replacePattern string. The number of
458 /// substitutions are returned.
459 /// ~~~ {.cpp}
460 /// TString s("aap noot mies");
461 /// const Int_t nrSub = TPRegexp("(\\w*) noot (\\w*)").Substitute(s,"$2 noot $1");
462 /// std::cout << nrSub << " \"" << s << "\"" <<std::endl;
463 /// ~~~
464 /// produces: 2 "mies noot aap"
465 ///
466 /// For meaning of mods see ParseMods().
467 
468 Int_t TPRegexp::Substitute(TString &s, const TString &replacePattern,
469  const TString &mods, Int_t start, Int_t nMaxMatch)
470 {
471  UInt_t opts = ParseMods(mods);
472 
473  if (!fPriv->fPCRE || opts != fPCREOpts) {
474  fPCREOpts = opts;
475  Compile();
476  }
477 
478  return SubstituteInternal(s, replacePattern, start, nMaxMatch, kTRUE);
479 }
480 
481 
482 ////////////////////////////////////////////////////////////////////////////////
483 /// Returns true if underlying PCRE structure has been successfully
484 /// generated via regexp compilation.
485 
487 {
488  return fPriv->fPCRE != 0;
489 }
490 
491 ////////////////////////////////////////////////////////////////////////////////
492 /// Get value of static flag controlling whether exception should be thrown upon an
493 /// error during regular expression compilation by the PCRE engine.
494 
496 {
497  return fgThrowAtCompileError;
498 }
499 
500 ////////////////////////////////////////////////////////////////////////////////
501 /// Set static flag controlling whether exception should be thrown upon an
502 /// error during regular expression compilation by the PCRE engine.
503 
505 {
506  fgThrowAtCompileError = throwp;
507 }
508 
509 ////////////////////////////////////////////////////////////////////////////////
510 // //
511 // TString member functions, put here so the linker will include //
512 // them only if regular expressions are used. //
513 // //
514 ////////////////////////////////////////////////////////////////////////////////
515 
516 ////////////////////////////////////////////////////////////////////////////////
517 /// Find the first occurrence of the regexp in string and return the position.
518 /// Start is the offset at which the search should start.
519 
521 {
522  TArrayI pos;
523  Int_t nrMatch = r.Match(*this,"",start,10,&pos);
524  if (nrMatch > 0)
525  return pos[0];
526  else
527  return -1;
528 }
529 
530 ////////////////////////////////////////////////////////////////////////////////
531 /// Find the first occurrence of the regexp in string and return the position.
532 /// Extent is length of the matched string and start is the offset at which
533 /// the matching should start.
534 
536 {
537  TArrayI pos;
538  const Int_t nrMatch = r.Match(*this,"",start,10,&pos);
539  if (nrMatch > 0) {
540  *extent = pos[1]-pos[0];
541  return pos[0];
542  } else {
543  *extent = 0;
544  return -1;
545  }
546 }
547 
548 ////////////////////////////////////////////////////////////////////////////////
549 /// Return the substring found by applying the regexp starting at start.
550 
552 {
553  Ssiz_t len;
554  Ssiz_t begin = Index(r, &len, start);
555  return TSubString(*this, begin, len);
556 }
557 
558 ////////////////////////////////////////////////////////////////////////////////
559 /// Return the substring found by applying the regexp.
560 
562 {
563  return (*this)(r, 0);
564 }
565 
566 
567 /** \class TPMERegexp
568 
569 Wrapper for PCRE library (Perl Compatible Regular Expressions).
570 Based on PME - PCRE Made Easy by Zachary Hansen.
571 
572 Supports main Perl operations using regular expressions (Match,
573 Substitute and Split). To retrieve the results one can simply use
574 operator[] returning a TString.
575 
576 See $ROOTSYS/tutorials/regexp_pme.C for examples.
577 */
578 
580 
581 ////////////////////////////////////////////////////////////////////////////////
582 /// Default constructor. This regexp will match an empty string.
583 
585  TPRegexp(),
586  fNMaxMatches(10),
587  fNMatches(0),
588  fAddressOfLastString(0),
589  fLastGlobalPosition(0)
590 {
591  Compile();
592 }
593 
594 ////////////////////////////////////////////////////////////////////////////////
595 /// Constructor.
596 ///
597 /// \param[in] s string to compile into regular expression
598 /// \param[in] opts perl-style character flags to be set on TPME object
599 /// \param[in] nMatchMax maximum number of matches
600 
601 TPMERegexp::TPMERegexp(const TString& s, const TString& opts, Int_t nMatchMax) :
602  TPRegexp(s),
603  fNMaxMatches(nMatchMax),
604  fNMatches(0),
605  fAddressOfLastString(0),
606  fLastGlobalPosition(0)
607 {
608  fPCREOpts = ParseMods(opts);
609  Compile();
610 }
611 
612 ////////////////////////////////////////////////////////////////////////////////
613 /// Constructor.
614 ///
615 /// \param[in] s string to compile into regular expression
616 /// \param[in] opts PCRE-style option flags to be set on TPME object
617 /// \param[in] nMatchMax maximum number of matches
618 
619 TPMERegexp::TPMERegexp(const TString& s, UInt_t opts, Int_t nMatchMax) :
620  TPRegexp(s),
621  fNMaxMatches(nMatchMax),
622  fNMatches(0),
623  fAddressOfLastString(0),
624  fLastGlobalPosition(0)
625 {
626  fPCREOpts = opts;
627  Compile();
628 }
629 
630 ////////////////////////////////////////////////////////////////////////////////
631 /// Copy constructor.
632 /// Only PCRE specifics are copied, not last-match or global-match
633 /// information.
634 
636  TPRegexp(r),
637  fNMaxMatches(r.fNMaxMatches),
638  fNMatches(0),
639  fAddressOfLastString(0),
640  fLastGlobalPosition(0)
641 {
642  Compile();
643 }
644 
645 ////////////////////////////////////////////////////////////////////////////////
646 /// Reset the pattern and options.
647 /// If 'nMatchMax' other than -1 (the default) is passed, it is also set.
648 
649 void TPMERegexp::Reset(const TString& s, const TString& opts, Int_t nMatchMax)
650 {
651  Reset(s, ParseMods(opts), nMatchMax);
652 }
653 
654 ////////////////////////////////////////////////////////////////////////////////
655 /// Reset the pattern and options.
656 /// If 'nMatchMax' other than -1 (the default) is passed, it is also set.
657 
658 void TPMERegexp::Reset(const TString& s, UInt_t opts, Int_t nMatchMax)
659 {
660  fPattern = s;
661  fPCREOpts = opts;
662  Compile();
663 
664  if (nMatchMax != -1)
665  fNMatches = nMatchMax;
666  fNMatches = 0;
668 }
669 
670 ////////////////////////////////////////////////////////////////////////////////
671 /// Copy global-match state from 're', so that this regexp can continue
672 /// parsing the string from where 're' left off.
673 ///
674 /// Alternatively, GetGlobalPosition() can be used to retrieve the
675 /// last match position so that it can be passed to Match().
676 ///
677 /// Ideally, as it is done in PERL, the last match position would be
678 /// stored in the TString itself.
679 
681 {
684 }
685 
686 ////////////////////////////////////////////////////////////////////////////////
687 /// Reset state of global match.
688 /// This happens automatically when a new string is passed for matching.
689 /// But be careful, as the address of last TString object is used
690 /// to make this decision.
691 
693 {
695 }
696 
697 ////////////////////////////////////////////////////////////////////////////////
698 /// Runs a match on s against the regex 'this' was created with.
699 ///
700 /// \param[in] s string to match against
701 /// \param[in] start offset at which to start matching
702 /// \return number of matches found
703 
705 {
706  // If we got a new string, reset the global position counter.
707  if (fAddressOfLastString != (void*) &s) {
709  }
710 
711  if (fPCREOpts & kPCRE_GLOBAL) {
712  start += fLastGlobalPosition;
713  }
714 
715  //fprintf(stderr, "string: '%s' length: %d offset: %d\n", s.Data(), s.length(), offset);
717 
718  //fprintf(stderr, "MatchInternal_exec result = %d\n", fNMatches);
719 
720  fLastStringMatched = s;
721  fAddressOfLastString = (void*) &s;
722 
723  if (fPCREOpts & kPCRE_GLOBAL) {
724  if (fNMatches == PCRE_ERROR_NOMATCH) {
725  // fprintf(stderr, "TPME RESETTING: reset for no match\n");
726  fLastGlobalPosition = 0; // reset the position for next match (perl does this)
727  } else if (fNMatches > 0) {
728  // fprintf(stderr, "TPME RESETTING: setting to %d\n", marks[0].second);
729  fLastGlobalPosition = fMarkers[1]; // set to the end of the match
730  } else {
731  // fprintf(stderr, "TPME RESETTING: reset for no unknown\n");
733  }
734  }
735 
736  return fNMatches;
737 }
738 
739 ////////////////////////////////////////////////////////////////////////////////
740 /// Splits into at most maxfields. If maxfields is unspecified or
741 /// 0, trailing empty matches are discarded. If maxfields is
742 /// positive, no more than maxfields fields will be returned and
743 /// trailing empty matches are preserved. If maxfields is negative,
744 /// all fields (including trailing empty ones) are returned. This
745 /// *should* be the same as the perl behaviour.
746 ///
747 /// If pattern produces sub-matches, these are also stored in
748 /// the result.
749 ///
750 /// A pattern matching the null string will split the value of EXPR
751 /// into separate characters at each point it matches that way.
752 ///
753 /// \param[in] s string to split
754 /// \param[in] maxfields maximum number of fields to be split out. 0 means
755 /// split all fields, but discard any trailing empty bits.
756 /// Negative means split all fields and keep trailing empty bits.
757 /// Positive means keep up to N fields including any empty fields
758 /// less than N. Anything remaining is in the last field.
759 /// \return number of fields found
760 
761 Int_t TPMERegexp::Split(const TString& s, Int_t maxfields)
762 {
763  typedef std::pair<int, int> MarkerLoc_t;
764  typedef std::vector<MarkerLoc_t> MarkerLocVec_t;
765 
766  // stores the marks for the split
767  MarkerLocVec_t oMarks;
768 
769  // this is a list of current trailing empty matches if maxfields is
770  // unspecified or 0. If there is stuff in it and a non-empty match
771  // is found, then everything in here is pushed into oMarks and then
772  // the new match is pushed on. If the end of the string is reached
773  // and there are empty matches in here, they are discarded.
774  MarkerLocVec_t oCurrentTrailingEmpties;
775 
776  Int_t nOffset = 0;
777  Int_t nMatchesFound = 0;
778 
779  // while we are still finding matches and maxfields is 0 or negative
780  // (meaning we get all matches), or we haven't gotten to the number
781  // of specified matches
782  Int_t matchRes;
783  while ((matchRes = Match(s, nOffset)) &&
784  ((maxfields < 1) || nMatchesFound < maxfields)) {
785  ++nMatchesFound;
786 
787  if (fMarkers[1] - fMarkers[0] == 0) {
788  oMarks.push_back(MarkerLoc_t(nOffset, nOffset + 1));
789  ++nOffset;
790  if (nOffset >= s.Length())
791  break;
792  else
793  continue;
794  }
795 
796  // match can be empty
797  if (nOffset != fMarkers[0]) {
798  if (!oCurrentTrailingEmpties.empty()) {
799  oMarks.insert(oMarks.end(),
800  oCurrentTrailingEmpties.begin(),
801  oCurrentTrailingEmpties.end());
802  oCurrentTrailingEmpties.clear();
803  }
804  oMarks.push_back(MarkerLoc_t(nOffset, fMarkers[0]));
805  } else {
806  // empty match
807  if (maxfields == 0) {
808  // store for possible later inclusion
809  oCurrentTrailingEmpties.push_back(MarkerLoc_t(nOffset, nOffset));
810  } else {
811  oMarks.push_back(MarkerLoc_t(nOffset, nOffset));
812  }
813  }
814 
815  nOffset = fMarkers[1];
816 
817  if (matchRes > 1) {
818  for (Int_t i = 1; i < matchRes; ++i)
819  oMarks.push_back(MarkerLoc_t(fMarkers[2*i], fMarkers[2*i + 1]));
820  }
821  }
822 
823 
824  // if there were no matches found, push the whole thing on
825  if (nMatchesFound == 0) {
826  oMarks.push_back(MarkerLoc_t(0, s.Length()));
827  }
828  // if we ran out of matches, then append the rest of the string
829  // onto the end of the last split field
830  else if (maxfields > 0 && nMatchesFound >= maxfields) {
831  oMarks[oMarks.size() - 1].second = s.Length();
832  }
833  // else we have to add another entry for the end of the string
834  else {
835  Bool_t last_empty = (nOffset == s.Length());
836  if (!last_empty || maxfields < 0) {
837  if (!oCurrentTrailingEmpties.empty()) {
838  oMarks.insert(oMarks.end(),
839  oCurrentTrailingEmpties.begin(),
840  oCurrentTrailingEmpties.end());
841  }
842  oMarks.push_back(MarkerLoc_t(nOffset, s.Length()));
843  }
844  }
845 
846  fNMatches = oMarks.size();
848  for (Int_t i = 0; i < fNMatches; ++i) {
849  fMarkers[2*i] = oMarks[i].first;
850  fMarkers[2*i + 1] = oMarks[i].second;
851  }
852 
853  // fprintf(stderr, "match returning %d\n", fNMatches);
854  return fNMatches;
855 }
856 
857 ////////////////////////////////////////////////////////////////////////////////
858 /// Substitute matching part of s with r, dollar back-ref
859 /// substitution is performed if doDollarSubst is true (default).
860 /// Returns the number of substitutions made.
861 ///
862 /// After the substitution, another pass is made over the resulting
863 /// string and the following special tokens are interpreted:
864 /// - `\l` lowercase next char,
865 /// - `\u` uppercase next char,
866 /// - `\L` lowercase till `\E`,
867 /// - `\U` uppercase till `\E`, and
868 /// - `\E` end case modification.
869 
871 {
872  Int_t cnt = SubstituteInternal(s, r, 0, fNMaxMatches, doDollarSubst);
873 
874  TString ret;
875  Int_t state = 0;
876  Ssiz_t pos = 0, len = s.Length();
877  const Char_t *data = s.Data();
878  while (pos < len) {
879  Char_t c = data[pos];
880  if (c == '\\') {
881  c = data[pos+1]; // Rely on string-data being null-terminated.
882  switch (c) {
883  case 0 : ret += '\\'; break;
884  case 'l': state = 1; break;
885  case 'u': state = 2; break;
886  case 'L': state = 3; break;
887  case 'U': state = 4; break;
888  case 'E': state = 0; break;
889  default : ret += '\\'; ret += c; break;
890  }
891  pos += 2;
892  } else {
893  switch (state) {
894  case 0: ret += c; break;
895  case 1: ret += (Char_t) tolower(c); state = 0; break;
896  case 2: ret += (Char_t) toupper(c); state = 0; break;
897  case 3: ret += (Char_t) tolower(c); break;
898  case 4: ret += (Char_t) toupper(c); break;
899  default: Error("TPMERegexp::Substitute", "invalid state.");
900  }
901  ++pos;
902  }
903  }
904 
905  s = ret;
906 
907  return cnt;
908 }
909 
910 ////////////////////////////////////////////////////////////////////////////////
911 /// Returns the sub-string from the internal fMarkers vector.
912 /// Requires having run match or split first.
913 
915 {
916  if (index >= fNMatches)
917  return "";
918 
919  Int_t begin = fMarkers[2*index];
920  Int_t end = fMarkers[2*index + 1];
921  return fLastStringMatched(begin, end-begin);
922 }
923 
924 ////////////////////////////////////////////////////////////////////////////////
925 /// Print the regular expression and modifier options.
926 /// If 'option' contains "all", prints also last string match and
927 /// match results.
928 
930 {
931  TString opt = option;
932  opt.ToLower();
933 
934  Printf("Regexp='%s', Opts='%s'", fPattern.Data(), GetModifiers().Data());
935  if (opt.Contains("all")) {
936  Printf(" last string='%s'", fLastStringMatched.Data());
937  Printf(" number of matches = %d", fNMatches);
938  for (Int_t i=0; i<fNMatches; ++i)
939  Printf(" %d - %s", i, operator[](i).Data());
940  }
941 }
942 
943 
944 /** \class TStringToken
945 Provides iteration through tokens of a given string.
946 
947  - fFullStr stores the string to be split. It is never modified.
948  - fSplitRe is the perl-re that is used to separate the tokens.
949  - fReturnVoid if true, empty strings will be returned.
950 
951 Current token is stored in the TString base-class.
952 During construction no match is done, use NextToken() to get the first
953 and all subsequent tokens.
954 */
955 
957 
958 ////////////////////////////////////////////////////////////////////////////////
959 /// Constructor.
960 
961 TStringToken::TStringToken(const TString& fullStr, const TString& splitRe, Bool_t retVoid) :
962  fFullStr (fullStr),
963  fSplitRe (splitRe),
964  fReturnVoid (retVoid),
965  fPos (0)
966 {
967 }
968 
969 ////////////////////////////////////////////////////////////////////////////////
970 /// Get the next token, it is stored in this TString.
971 /// Returns true if new token is available, false otherwise.
972 
974 {
975  TArrayI x;
976  while (fPos < fFullStr.Length()) {
977  if (fSplitRe.Match(fFullStr, "", fPos, 2, &x)) {
979  fPos = x[1];
980  } else {
982  fPos = fFullStr.Length() + 1;
983  }
984  if (Length() || fReturnVoid)
985  return kTRUE;
986  }
987 
988  // Special case: void-strings are requested and the full-string
989  // ends with the separator. Thus we return another empty string.
990  if (fPos == fFullStr.Length() && fReturnVoid) {
991  TString::operator=("");
992  fPos = fFullStr.Length() + 1;
993  return kTRUE;
994  }
995 
996  return kFALSE;
997 }
A zero length substring is legal.
Definition: TString.h:83
Bool_t IsValid() const
Returns true if underlying PCRE structure has been successfully generated via regexp compilation...
Definition: TPRegexp.cxx:486
TString fLastStringMatched
Definition: TPRegexp.h:113
An array of TObjects.
Definition: TObjArray.h:39
Int_t fPos
Definition: TPRegexp.h:155
Int_t fNMatches
Definition: TPRegexp.h:110
Ssiz_t Length() const
Definition: TString.h:390
Collectable string class.
Definition: TObjString.h:32
return c
const char Option_t
Definition: RtypesCore.h:62
tuple offset
Definition: tree.py:93
TArrayI fMarkers
Definition: TPRegexp.h:111
void AssignGlobalState(const TPMERegexp &re)
Copy global-match state from 're; so that this regexp can continue parsing the string from where 're'...
Definition: TPRegexp.cxx:680
virtual void SetOwner(Bool_t enable=kTRUE)
Set whether this collection is the owner (enable==true) of its content.
TObjArray * MatchS(const TString &s, const TString &mods="", Int_t start=0, Int_t nMaxMatch=10)
Returns a TObjArray of matched substrings as TObjString's.
Definition: TPRegexp.cxx:366
static void SetThrowAtCompileError(Bool_t throwp)
Set static flag controlling whether exception should be thrown upon an error during regular expressio...
Definition: TPRegexp.cxx:504
virtual ~TPRegexp()
Cleanup.
Definition: TPRegexp.cxx:76
Basic string class.
Definition: TString.h:137
void ToLower()
Change string to lower-case.
Definition: TString.cxx:1075
TString GetModifiers() const
Return PCRE modifier options as string.
Definition: TPRegexp.cxx:176
int Int_t
Definition: RtypesCore.h:41
TString & operator=(char s)
Assign character c to TString.
Definition: TString.cxx:245
bool Bool_t
Definition: RtypesCore.h:59
const Bool_t kFALSE
Definition: Rtypes.h:92
Int_t Substitute(TString &s, const TString &replace, const TString &mods="", Int_t start=0, Int_t nMatchMax=10)
Substitute replaces the string s by a new string in which matching patterns are replaced by the repla...
Definition: TPRegexp.cxx:468
TPRegexp fSplitRe
Definition: TPRegexp.h:153
Array of integers (32 bits per element).
Definition: TArrayI.h:29
const char * Data() const
Definition: TString.h:349
PCREPriv_t * fPriv
Definition: TPRegexp.h:53
Int_t fNMaxMatches
Definition: TPRegexp.h:109
Double_t x[n]
Definition: legend1.C:17
static TString Format(const char *fmt,...)
Static method which formats a string using a printf style format descriptor and return a TString...
Definition: TString.cxx:2321
Provides iteration through tokens of a given string.
Definition: TPRegexp.h:149
void Compile()
Compile the fPattern.
Definition: TPRegexp.cxx:194
void Info(const char *location, const char *msgfmt,...)
std::vector< std::vector< double > > Data
void Set(Int_t n)
Set size of this array to n ints.
Definition: TArrayI.cxx:105
void Error(const char *location, const char *msgfmt,...)
char & operator()(Ssiz_t i)
Definition: TString.h:657
ROOT::R::TRInterface & r
Definition: Object.C:4
Int_t Split(const TString &s, Int_t maxfields=0)
Splits s into at most maxfields fields.
Definition: TPRegexp.cxx:761
void * fAddressOfLastString
Definition: TPRegexp.h:114
TPRegexp & operator=(const TPRegexp &p)
Assignment operator.
Definition: TPRegexp.cxx:88
unsigned int UInt_t
Definition: RtypesCore.h:42
TMarker * m
Definition: textangle.C:8
virtual void Print(Option_t *option="")
Print the regular expression and modifier options.
Definition: TPRegexp.cxx:929
ClassImp(TPMERegexp)
#define Printf
Definition: TGeoToOCC.h:18
int Ssiz_t
Definition: RtypesCore.h:63
UInt_t ParseMods(const TString &mods) const
Translate Perl modifier flags into pcre flags.
Definition: TPRegexp.cxx:131
UInt_t fPCREOpts
Definition: TPRegexp.h:54
TPMERegexp()
Default constructor. This regexp will match an empty string.
Definition: TPRegexp.cxx:584
Int_t SubstituteInternal(TString &s, const TString &replace, Int_t start, Int_t nMaxMatch0, Bool_t doDollarSubst) const
Perform pattern substitution with optional back-ref replacement.
Definition: TPRegexp.cxx:392
Int_t MatchInternal(const TString &s, Int_t start, Int_t nMaxMatch, TArrayI *pos=0) const
Perform the actual matching - protected method.
Definition: TPRegexp.cxx:303
Bool_t NextToken()
Get the next token, it is stored in this TString.
Definition: TPRegexp.cxx:973
static Bool_t fgThrowAtCompileError
Definition: TPRegexp.h:56
static Bool_t GetThrowAtCompileError()
Get value of static flag controlling whether exception should be thrown upon an error during regular ...
Definition: TPRegexp.cxx:495
Int_t Match(const TString &s, UInt_t start=0)
Runs a match on s against the regex 'this' was created with.
Definition: TPRegexp.cxx:704
char Char_t
Definition: RtypesCore.h:29
const TString fFullStr
Definition: TPRegexp.h:152
Wrapper for PCRE library (Perl Compatible Regular Expressions).
Definition: TPRegexp.h:103
Int_t fLastGlobalPosition
Definition: TPRegexp.h:116
Bool_t Contains(const char *pat, ECaseCompare cmp=kExact) const
Definition: TString.h:567
Bool_t fReturnVoid
Definition: TPRegexp.h:154
Int_t ReplaceSubs(const TString &s, TString &final, const TString &replacePattern, Int_t *ovec, Int_t nmatch) const
Returns the number of expanded '$' constructs.
Definition: TPRegexp.cxx:247
void Add(TObject *obj)
Definition: TObjArray.h:75
void Reset(const TString &s, const TString &opts="", Int_t nMatchMax=-1)
Reset the pattern and options.
Definition: TPRegexp.cxx:649
Ssiz_t Index(const char *pat, Ssiz_t i=0, ECaseCompare cmp=kExact) const
Definition: TString.h:582
Int_t Substitute(TString &s, const TString &r, Bool_t doDollarSubst=kTRUE)
Substitute matching part of s with r, dollar back-ref substitution is performed if doDollarSubst is t...
Definition: TPRegexp.cxx:870
const Bool_t kTRUE
Definition: Rtypes.h:91
friend class TSubString
Definition: TString.h:140
Int_t Match(const TString &s, const TString &mods="", Int_t start=0, Int_t nMaxMatch=10, TArrayI *pos=0)
Returns the number of matches; this equals the full match plus the sub-pattern matches.
Definition: TPRegexp.cxx:335
TString operator[](Int_t)
Returns the sub-string from the internal fMarkers vector.
Definition: TPRegexp.cxx:914
const char * cnt
Definition: TXMLSetup.cxx:75
TString fPattern
Definition: TPRegexp.h:52
void Optimize()
Send the pattern through the optimizer.
Definition: TPRegexp.cxx:226
void ResetGlobalState()
Reset state of global match.
Definition: TPRegexp.cxx:692