Logo ROOT   6.19/01
Reference Guide
TPRegexp.cxx
Go to the documentation of this file.
1 // @(#)root/base:$Id$
2 // Author: Eddy Offermann 24/06/05
3 
4 /*************************************************************************
5  * Copyright (C) 1995-2005, Rene Brun and Fons Rademakers. *
6  * All rights reserved. *
7  * *
8  * For the licensing terms see $ROOTSYS/LICENSE. *
9  * For the list of contributors see $ROOTSYS/README/CREDITS. *
10  *************************************************************************/
11 
12 /** \class TPRegexp
13 \ingroup Base
14 
15 C++ Wrapper for the "Perl Compatible Regular Expressions" library
16  The PCRE lib can be found at: http://www.pcre.org/
17 
18 Extensive documentation about Regular expressions in Perl can be
19 found at : http://perldoc.perl.org/perlre.html
20 */
21 
22 #include "Riostream.h"
23 #include "TPRegexp.h"
24 #include "TObjArray.h"
25 #include "TObjString.h"
26 #include "TError.h"
27 
28 #ifdef R__WIN32
29 #define PCRE_STATIC
30 #endif
31 #include <pcre.h>
32 
33 #include <vector>
34 #include <stdexcept>
35 
36 struct PCREPriv_t {
37  pcre *fPCRE;
38  pcre_extra *fPCREExtra;
39 
40  PCREPriv_t() { fPCRE = 0; fPCREExtra = 0; }
41 };
42 
43 
45 
47 
48 ////////////////////////////////////////////////////////////////////////////////
49 /// Default ctor.
50 
52 {
53  fPriv = new PCREPriv_t;
54  fPCREOpts = 0;
55 }
56 
57 ////////////////////////////////////////////////////////////////////////////////
58 /// Create and initialize with pat.
59 
61 {
62  fPattern = pat;
63  fPriv = new PCREPriv_t;
64  fPCREOpts = 0;
65 }
66 
67 ////////////////////////////////////////////////////////////////////////////////
68 /// Copy ctor.
69 
71 {
72  fPattern = p.fPattern;
73  fPriv = new PCREPriv_t;
74  fPCREOpts = p.fPCREOpts;
75 }
76 
77 ////////////////////////////////////////////////////////////////////////////////
78 /// Cleanup.
79 
81 {
82  if (fPriv->fPCRE)
83  pcre_free(fPriv->fPCRE);
84  if (fPriv->fPCREExtra)
85  pcre_free(fPriv->fPCREExtra);
86  delete fPriv;
87 }
88 
89 ////////////////////////////////////////////////////////////////////////////////
90 /// Assignment operator.
91 
93 {
94  if (this != &p) {
95  fPattern = p.fPattern;
96  if (fPriv->fPCRE)
97  pcre_free(fPriv->fPCRE);
98  fPriv->fPCRE = 0;
99  if (fPriv->fPCREExtra)
100  pcre_free(fPriv->fPCREExtra);
101  fPriv->fPCREExtra = 0;
102  fPCREOpts = p.fPCREOpts;
103  }
104  return *this;
105 }
106 
107 ////////////////////////////////////////////////////////////////////////////////
108 /// Translate Perl modifier flags into pcre flags.
109 /// The supported modStr characters are: g, i, m, o, s, x, and the
110 /// special d for debug. The meaning of the letters is:
111 /// - m
112 /// Treat string as multiple lines. That is, change "^" and "$" from
113 /// matching the start or end of the string to matching the start or
114 /// end of any line anywhere within the string.
115 /// - s
116 /// Treat string as single line. That is, change "." to match any
117 /// character whatsoever, even a newline, which normally it would not match.
118 /// Used together, as /ms, they let the "." match any character whatsoever,
119 /// while still allowing "^" and "$" to match, respectively, just after and
120 /// just before newlines within the string.
121 /// - i
122 /// Do case-insensitive pattern matching.
123 /// - x
124 /// Extend your pattern's legibility by permitting whitespace and comments.
125 /// - p
126 /// Preserve the string matched such that ${^PREMATCH}, ${^MATCH},
127 /// and ${^POSTMATCH} are available for use after matching (Perl only; ParseMods() reports p as an illegal modifier).
128 /// - g and c
129 /// Global matching, and keep the Current position after failed matching (only g is accepted by ParseMods(); c is reported as an illegal modifier).
130 /// Unlike i, m, s and x, these two flags affect the way the regex is used
131 /// rather than the regex itself. See Using regular expressions in Perl in
132 /// perlretut for further explanation of the g and c modifiers.
133 /// For more detail see: http://perldoc.perl.org/perlre.html#Modifiers.
134 
135 UInt_t TPRegexp::ParseMods(const TString &modStr) const
136 {
137  UInt_t opts = 0;
138 
139  if (modStr.Length() <= 0)
140  return fPCREOpts;
141 
142  //translate perl flags into pcre flags
143  const char *m = modStr;
144  while (*m) {
145  switch (*m) {
146  case 'g':
147  opts |= kPCRE_GLOBAL;
148  break;
149  case 'i':
150  opts |= PCRE_CASELESS;
151  break;
152  case 'm':
153  opts |= PCRE_MULTILINE;
154  break;
155  case 'o':
156  opts |= kPCRE_OPTIMIZE;
157  break;
158  case 's':
159  opts |= PCRE_DOTALL;
160  break;
161  case 'x':
162  opts |= PCRE_EXTENDED;
163  break;
164  case 'd': // special flag to enable debug printing (not Perl compat.)
165  opts |= kPCRE_DEBUG_MSGS;
166  break;
167  default:
168  Error("ParseMods", "illegal pattern modifier: %c", *m);
169  opts = 0;
170  }
171  ++m;
172  }
173  return opts;
174 }
175 
176 ////////////////////////////////////////////////////////////////////////////////
177 /// Return PCRE modifier options as string.
178 /// For meaning of mods see ParseMods().
179 
181 {
182  TString ret;
183 
184  if (fPCREOpts & kPCRE_GLOBAL) ret += 'g';
185  if (fPCREOpts & PCRE_CASELESS) ret += 'i';
186  if (fPCREOpts & PCRE_MULTILINE) ret += 'm';
187  if (fPCREOpts & PCRE_DOTALL) ret += 's';
188  if (fPCREOpts & PCRE_EXTENDED) ret += 'x';
189  if (fPCREOpts & kPCRE_OPTIMIZE) ret += 'o';
190  if (fPCREOpts & kPCRE_DEBUG_MSGS) ret += 'd';
191 
192  return ret;
193 }
194 
195 ////////////////////////////////////////////////////////////////////////////////
196 /// Compile the fPattern.
197 
199 {
200  if (fPriv->fPCRE)
201  pcre_free(fPriv->fPCRE);
202 
204  Info("Compile", "PREGEX compiling %s", fPattern.Data());
205 
206  const char *errstr;
207  Int_t patIndex;
208  fPriv->fPCRE = pcre_compile(fPattern.Data(), fPCREOpts & kPCRE_INTMASK,
209  &errstr, &patIndex, 0);
210 
211  if (!fPriv->fPCRE) {
212  if (fgThrowAtCompileError) {
213  throw std::runtime_error
214  (TString::Format("TPRegexp::Compile() compilation of TPRegexp(%s) failed at: %d because %s",
215  fPattern.Data(), patIndex, errstr).Data());
216  } else {
217  Error("Compile", "compilation of TPRegexp(%s) failed at: %d because %s",
218  fPattern.Data(), patIndex, errstr);
219  return;
220  }
221  }
222 
223  if (fPriv->fPCREExtra || (fPCREOpts & kPCRE_OPTIMIZE))
224  Optimize();
225 }
226 
227 ////////////////////////////////////////////////////////////////////////////////
228 /// Send the pattern through the optimizer.
229 
231 {
232  if (fPriv->fPCREExtra)
233  pcre_free(fPriv->fPCREExtra);
234 
236  Info("Optimize", "PREGEX studying %s", fPattern.Data());
237 
238  const char *errstr;
239  // pcre_study allows less options - see pcre_internal.h PUBLIC_STUDY_OPTIONS.
240  fPriv->fPCREExtra = pcre_study(fPriv->fPCRE, 0, &errstr);
241 
242  if (!fPriv->fPCREExtra && errstr) {
243  Error("Optimize", "Optimization of TPRegexp(%s) failed: %s",
244  fPattern.Data(), errstr);
245  }
246 }
247 
248 ////////////////////////////////////////////////////////////////////////////////
249 /// Returns the number of expanded '$' constructs.
250 
252  const TString &replacePattern,
253  Int_t *offVec, Int_t nrMatch) const
254 {
255  Int_t nrSubs = 0;
256  const char *p = replacePattern;
257 
258  Int_t state = 0;
259  Int_t subnum = 0;
260  while (state != -1) {
261  switch (state) {
262  case 0:
263  if (!*p) {
264  state = -1;
265  break;
266  }
267  if (*p == '$') {
268  state = 1;
269  subnum = 0;
270  if (p[1] == '&') {
271  p++;
272  if (isdigit(p[1]))
273  p++;
274  } else if (!isdigit(p[1])) {
275  Error("ReplaceSubs", "badly formed replacement pattern: %s",
276  replacePattern.Data());
277  }
278  } else
279  final += *p;
280  break;
281  case 1:
282  if (isdigit(*p)) {
283  subnum *= 10;
284  subnum += (*p)-'0';
285  } else {
287  Info("ReplaceSubs", "PREGEX appending substr #%d", subnum);
288  if (subnum < 0 || subnum > nrMatch-1) {
289  Error("ReplaceSubs","bad string number: %d",subnum);
290  } else {
291  const TString subStr = s(offVec[2*subnum],offVec[2*subnum+1]-offVec[2*subnum]);
292  final += subStr;
293  nrSubs++;
294  }
295  state = 0;
296  continue; // send char to start state
297  }
298  }
299  p++;
300  }
301  return nrSubs;
302 }
303 
304 ////////////////////////////////////////////////////////////////////////////////
305 /// Perform the actual matching - protected method.
306 
308  Int_t nMaxMatch, TArrayI *pos) const
309 {
310  Int_t *offVec = new Int_t[3*nMaxMatch];
311  // pcre_exec allows less options - see pcre_internal.h PUBLIC_EXEC_OPTIONS.
312  Int_t nrMatch = pcre_exec(fPriv->fPCRE, fPriv->fPCREExtra, s.Data(),
313  s.Length(), start, 0,
314  offVec, 3*nMaxMatch);
315 
316  if (nrMatch == PCRE_ERROR_NOMATCH)
317  nrMatch = 0;
318  else if (nrMatch <= 0) {
319  Error("Match","pcre_exec error = %d", nrMatch);
320  delete [] offVec;
321  return 0;
322  }
323 
324  if (pos)
325  pos->Set(2*nrMatch, offVec);
326  delete [] offVec;
327 
328  return nrMatch;
329 }
330 
331 ////////////////////////////////////////////////////////////////////////////////
332 /// The number of matches is returned, this equals the full match +
333 /// sub-pattern matches.
334 /// nMaxMatch is the maximum allowed number of matches.
335 /// pos contains the string indices of the matches. Its usage is
336 /// shown in the routine MatchS.
337 /// For meaning of mods see ParseMods().
338 
339 Int_t TPRegexp::Match(const TString &s, const TString &mods, Int_t start,
340  Int_t nMaxMatch, TArrayI *pos)
341 {
342  UInt_t opts = ParseMods(mods);
343 
344  if (!fPriv->fPCRE || opts != fPCREOpts) {
345  fPCREOpts = opts;
346  Compile();
347  }
348 
349  return MatchInternal(s, start, nMaxMatch, pos);
350 }
351 
352 
353 ////////////////////////////////////////////////////////////////////////////////
354 /// Returns a TObjArray of matched substrings as TObjString's.
355 /// The TObjArray is owner of the objects and must be deleted by the user.
356 /// The first entry is the full matched pattern, followed by the sub-patterns.
357 /// If a pattern was not matched, it will return an empty substring:
358 /// ~~~ {.cpp}
359 /// TObjArray *subStrL = TPRegexp("(a|(z))(bc)").MatchS("abc");
360 /// for (Int_t i = 0; i < subStrL->GetLast()+1; i++) {
361 /// const TString subStr = ((TObjString *)subStrL->At(i))->GetString();
362 /// std::cout << "\"" << subStr << "\" ";
363 /// }
364 /// std::cout << std::endl;
365 /// ~~~
366 /// produces: "abc" "a" "" "bc"
367 ///
368 /// For meaning of mods see ParseMods().
369 
371  Int_t start, Int_t nMaxMatch)
372 {
373  TArrayI pos;
374  Int_t nrMatch = Match(s, mods, start, nMaxMatch, &pos);
375 
376  TObjArray *subStrL = new TObjArray();
377  subStrL->SetOwner();
378 
379  for (Int_t i = 0; i < nrMatch; i++) {
380  Int_t startp = pos[2*i];
381  Int_t stopp = pos[2*i+1];
382  if (startp >= 0 && stopp >= 0) {
383  const TString subStr = s(pos[2*i], pos[2*i+1]-pos[2*i]);
384  subStrL->Add(new TObjString(subStr));
385  } else
386  subStrL->Add(new TObjString());
387  }
388 
389  return subStrL;
390 }
391 
392 ////////////////////////////////////////////////////////////////////////////////
393 /// Perform pattern substitution with optional back-ref replacement
394 /// - protected method.
395 
397  Int_t start, Int_t nMaxMatch,
398  Bool_t doDollarSubst) const
399 {
400  Int_t *offVec = new Int_t[3*nMaxMatch];
401 
402  TString final;
403  Int_t nrSubs = 0;
404  Int_t offset = start;
405  Int_t last = 0;
406 
407  while (kTRUE) {
408 
409  // find next matching subs
410  // pcre_exec allows less options - see pcre_internal.h PUBLIC_EXEC_OPTIONS.
411  Int_t nrMatch = pcre_exec(fPriv->fPCRE, fPriv->fPCREExtra, s.Data(),
412  s.Length(), offset, 0,
413  offVec, 3*nMaxMatch);
414 
415  if (nrMatch == PCRE_ERROR_NOMATCH) {
416  nrMatch = 0;
417  break;
418  } else if (nrMatch <= 0) {
419  Error("Substitute", "pcre_exec error = %d", nrMatch);
420  break;
421  }
422 
423  // append anything previously unmatched, but not substituted
424  if (last <= offVec[0]) {
425  final += s(last,offVec[0]-last);
426  last = offVec[1];
427  }
428 
429  // replace stuff in s
430  if (doDollarSubst) {
431  ReplaceSubs(s, final, replacePattern, offVec, nrMatch);
432  } else {
433  final += replacePattern;
434  }
435  ++nrSubs;
436 
437  // if global gotta check match at every pos
438  if (!(fPCREOpts & kPCRE_GLOBAL))
439  break;
440 
441  if (offVec[0] != offVec[1])
442  offset = offVec[1];
443  else {
444  // matched empty string
445  if (offVec[1] == s.Length())
446  break;
447  offset = offVec[1]+1;
448  }
449  }
450 
451  delete [] offVec;
452 
453  final += s(last,s.Length()-last);
454  s = final;
455 
456  return nrSubs;
457 }
458 
459 ////////////////////////////////////////////////////////////////////////////////
460 /// Substitute replaces the string s by a new string in which matching
461 /// patterns are replaced by the replacePattern string. The number of
462 /// substitutions are returned.
463 /// ~~~ {.cpp}
464 /// TString s("aap noot mies");
465 /// const Int_t nrSub = TPRegexp("(\\w*) noot (\\w*)").Substitute(s,"$2 noot $1");
466 /// std::cout << nrSub << " \"" << s << "\"" <<std::endl;
467 /// ~~~
468 /// produces: 1 "mies noot aap"
469 ///
470 /// For meaning of mods see ParseMods().
471 
472 Int_t TPRegexp::Substitute(TString &s, const TString &replacePattern,
473  const TString &mods, Int_t start, Int_t nMaxMatch)
474 {
475  UInt_t opts = ParseMods(mods);
476 
477  if (!fPriv->fPCRE || opts != fPCREOpts) {
478  fPCREOpts = opts;
479  Compile();
480  }
481 
482  return SubstituteInternal(s, replacePattern, start, nMaxMatch, kTRUE);
483 }
484 
485 
486 ////////////////////////////////////////////////////////////////////////////////
487 /// Returns true if underlying PCRE structure has been successfully
488 /// generated via regexp compilation.
489 
491 {
492  return fPriv->fPCRE != 0;
493 }
494 
495 ////////////////////////////////////////////////////////////////////////////////
496 /// Get value of static flag controlling whether exception should be thrown upon an
497 /// error during regular expression compilation by the PCRE engine.
498 
500 {
501  return fgThrowAtCompileError;
502 }
503 
504 ////////////////////////////////////////////////////////////////////////////////
505 /// Set static flag controlling whether exception should be thrown upon an
506 /// error during regular expression compilation by the PCRE engine.
507 
509 {
510  fgThrowAtCompileError = throwp;
511 }
512 
513 ////////////////////////////////////////////////////////////////////////////////
514 // //
515 // TString member functions, put here so the linker will include //
516 // them only if regular expressions are used. //
517 // //
518 ////////////////////////////////////////////////////////////////////////////////
519 
520 ////////////////////////////////////////////////////////////////////////////////
521 /// Find the first occurrence of the regexp in string and return the position.
522 /// Start is the offset at which the search should start.
523 
525 {
526  TArrayI pos;
527  Int_t nrMatch = r.Match(*this,"",start,10,&pos);
528  if (nrMatch > 0)
529  return pos[0];
530  else
531  return -1;
532 }
533 
534 ////////////////////////////////////////////////////////////////////////////////
535 /// Find the first occurrence of the regexp in string and return the position.
536 /// Extent is length of the matched string and start is the offset at which
537 /// the matching should start.
538 
539 Ssiz_t TString::Index(TPRegexp& r, Ssiz_t* extent, Ssiz_t start) const
540 {
541  TArrayI pos;
542  const Int_t nrMatch = r.Match(*this,"",start,10,&pos);
543  if (nrMatch > 0) {
544  *extent = pos[1]-pos[0];
545  return pos[0];
546  } else {
547  *extent = 0;
548  return -1;
549  }
550 }
551 
552 ////////////////////////////////////////////////////////////////////////////////
553 /// Return the substring found by applying the regexp starting at start.
554 
556 {
557  Ssiz_t len;
558  Ssiz_t begin = Index(r, &len, start);
559  return TSubString(*this, begin, len);
560 }
561 
562 ////////////////////////////////////////////////////////////////////////////////
563 /// Return the substring found by applying the regexp.
564 
566 {
567  return (*this)(r, 0);
568 }
569 
570 
571 /** \class TPMERegexp
572 
573 Wrapper for PCRE library (Perl Compatible Regular Expressions).
574 Based on PME - PCRE Made Easy by Zachary Hansen.
575 
576 Supports main Perl operations using regular expressions (Match,
577 Substitute and Split). To retrieve the results one can simply use
578 operator[] returning a TString.
579 
580 See $ROOTSYS/tutorials/regexp_pme.C for examples.
581 */
582 
584 
585 ////////////////////////////////////////////////////////////////////////////////
586 /// Default constructor. This regexp will match an empty string.
587 
589  TPRegexp(),
590  fNMaxMatches(10),
591  fNMatches(0),
592  fAddressOfLastString(0),
593  fLastGlobalPosition(0)
594 {
595  Compile();
596 }
597 
598 ////////////////////////////////////////////////////////////////////////////////
599 /// Constructor.
600 ///
601 /// \param[in] s string to compile into regular expression
602 /// \param[in] opts perl-style character flags to be set on TPME object
603 /// \param[in] nMatchMax maximum number of matches
604 
605 TPMERegexp::TPMERegexp(const TString& s, const TString& opts, Int_t nMatchMax) :
606  TPRegexp(s),
607  fNMaxMatches(nMatchMax),
608  fNMatches(0),
609  fAddressOfLastString(0),
610  fLastGlobalPosition(0)
611 {
612  fPCREOpts = ParseMods(opts);
613  Compile();
614 }
615 
616 ////////////////////////////////////////////////////////////////////////////////
617 /// Constructor.
618 ///
619 /// \param[in] s string to compile into regular expression
620 /// \param[in] opts PCRE-style option flags to be set on TPME object
621 /// \param[in] nMatchMax maximum number of matches
622 
623 TPMERegexp::TPMERegexp(const TString& s, UInt_t opts, Int_t nMatchMax) :
624  TPRegexp(s),
625  fNMaxMatches(nMatchMax),
626  fNMatches(0),
627  fAddressOfLastString(0),
628  fLastGlobalPosition(0)
629 {
630  fPCREOpts = opts;
631  Compile();
632 }
633 
634 ////////////////////////////////////////////////////////////////////////////////
635 /// Copy constructor.
636 /// Only PCRE specifics are copied, not last-match or global-match
637 /// information.
638 
640  TPRegexp(r),
641  fNMaxMatches(r.fNMaxMatches),
642  fNMatches(0),
643  fAddressOfLastString(0),
644  fLastGlobalPosition(0)
645 {
646  Compile();
647 }
648 
649 ////////////////////////////////////////////////////////////////////////////////
650 /// Reset the pattern and options.
651 /// If 'nMatchMax' other than -1 (the default) is passed, it is also set.
652 
653 void TPMERegexp::Reset(const TString& s, const TString& opts, Int_t nMatchMax)
654 {
655  Reset(s, ParseMods(opts), nMatchMax);
656 }
657 
658 ////////////////////////////////////////////////////////////////////////////////
659 /// Reset the pattern and options.
660 /// If 'nMatchMax' other than -1 (the default) is passed, it is also set.
661 
662 void TPMERegexp::Reset(const TString& s, UInt_t opts, Int_t nMatchMax)
663 {
664  fPattern = s;
665  fPCREOpts = opts;
666  Compile();
667 
668  if (nMatchMax != -1)
669  fNMatches = nMatchMax;
670  fNMatches = 0;
672 }
673 
674 ////////////////////////////////////////////////////////////////////////////////
675 /// Copy global-match state from 're' so that this regexp can continue
676 /// parsing the string from where 're' left off.
677 ///
678 /// Alternatively, GetGlobalPosition() can be used to retrieve the
679 /// last match position so that it can be passed to Match().
680 ///
681 /// Ideally, as it is done in PERL, the last match position would be
682 /// stored in the TString itself.
683 
685 {
688 }
689 
690 ////////////////////////////////////////////////////////////////////////////////
691 /// Reset state of global match.
692 /// This happens automatically when a new string is passed for matching.
693 /// But be careful, as the address of the last TString object is used
694 /// to make this decision.
695 
697 {
699 }
700 
701 ////////////////////////////////////////////////////////////////////////////////
702 /// Runs a match on s against the regex 'this' was created with.
703 ///
704 /// \param[in] s string to match against
705 /// \param[in] start offset at which to start matching
706 /// \return number of matches found
707 
709 {
710  // If we got a new string, reset the global position counter.
711  if (fAddressOfLastString != (void*) &s) {
713  }
714 
715  if (fPCREOpts & kPCRE_GLOBAL) {
716  start += fLastGlobalPosition;
717  }
718 
719  //fprintf(stderr, "string: '%s' length: %d offset: %d\n", s.Data(), s.length(), offset);
721 
722  //fprintf(stderr, "MatchInternal_exec result = %d\n", fNMatches);
723 
725  fAddressOfLastString = (void*) &s;
726 
727  if (fPCREOpts & kPCRE_GLOBAL) {
728  if (fNMatches == PCRE_ERROR_NOMATCH) {
729  // fprintf(stderr, "TPME RESETTING: reset for no match\n");
730  fLastGlobalPosition = 0; // reset the position for next match (perl does this)
731  } else if (fNMatches > 0) {
732  // fprintf(stderr, "TPME RESETTING: setting to %d\n", marks[0].second);
733  fLastGlobalPosition = fMarkers[1]; // set to the end of the match
734  } else {
735  // fprintf(stderr, "TPME RESETTING: reset for no unknown\n");
737  }
738  }
739 
740  return fNMatches;
741 }
742 
743 ////////////////////////////////////////////////////////////////////////////////
744 /// Splits into at most maxfields. If maxfields is unspecified or
745 /// 0, trailing empty matches are discarded. If maxfields is
746 /// positive, no more than maxfields fields will be returned and
747 /// trailing empty matches are preserved. If maxfields is negative,
748 /// all fields (including trailing empty ones) are returned. This
749 /// *should* be the same as the perl behaviour.
750 ///
751 /// If pattern produces sub-matches, these are also stored in
752 /// the result.
753 ///
754 /// A pattern matching the null string will split the value of EXPR
755 /// into separate characters at each point it matches that way.
756 ///
757 /// \param[in] s string to split
758 /// \param[in] maxfields maximum number of fields to be split out. 0 means
759 /// split all fields, but discard any trailing empty bits.
760 /// Negative means split all fields and keep trailing empty bits.
761 /// Positive means keep up to N fields including any empty fields
762 /// less than N. Anything remaining is in the last field.
763 /// \return number of fields found
764 
766 {
767  typedef std::pair<int, int> MarkerLoc_t;
768  typedef std::vector<MarkerLoc_t> MarkerLocVec_t;
769 
770  // stores the marks for the split
771  MarkerLocVec_t oMarks;
772 
773  // this is a list of current trailing empty matches if maxfields is
774  // unspecified or 0. If there is stuff in it and a non-empty match
775  // is found, then everything in here is pushed into oMarks and then
776  // the new match is pushed on. If the end of the string is reached
777  // and there are empty matches in here, they are discarded.
778  MarkerLocVec_t oCurrentTrailingEmpties;
779 
780  Int_t nOffset = 0;
781  Int_t nMatchesFound = 0;
782 
783  // while we are still finding matches and maxfields is 0 or negative
784  // (meaning we get all matches), or we haven't gotten to the number
785  // of specified matches
786  Int_t matchRes;
787  while ((matchRes = Match(s, nOffset)) &&
788  ((maxfields < 1) || nMatchesFound < maxfields)) {
789  ++nMatchesFound;
790 
791  if (fMarkers[1] - fMarkers[0] == 0) {
792  oMarks.push_back(MarkerLoc_t(nOffset, nOffset + 1));
793  ++nOffset;
794  if (nOffset >= s.Length())
795  break;
796  else
797  continue;
798  }
799 
800  // match can be empty
801  if (nOffset != fMarkers[0]) {
802  if (!oCurrentTrailingEmpties.empty()) {
803  oMarks.insert(oMarks.end(),
804  oCurrentTrailingEmpties.begin(),
805  oCurrentTrailingEmpties.end());
806  oCurrentTrailingEmpties.clear();
807  }
808  oMarks.push_back(MarkerLoc_t(nOffset, fMarkers[0]));
809  } else {
810  // empty match
811  if (maxfields == 0) {
812  // store for possible later inclusion
813  oCurrentTrailingEmpties.push_back(MarkerLoc_t(nOffset, nOffset));
814  } else {
815  oMarks.push_back(MarkerLoc_t(nOffset, nOffset));
816  }
817  }
818 
819  nOffset = fMarkers[1];
820 
821  if (matchRes > 1) {
822  for (Int_t i = 1; i < matchRes; ++i)
823  oMarks.push_back(MarkerLoc_t(fMarkers[2*i], fMarkers[2*i + 1]));
824  }
825  }
826 
827 
828  // if there were no matches found, push the whole thing on
829  if (nMatchesFound == 0) {
830  oMarks.push_back(MarkerLoc_t(0, s.Length()));
831  }
832  // if we ran out of matches, then append the rest of the string
833  // onto the end of the last split field
834  else if (maxfields > 0 && nMatchesFound >= maxfields) {
835  oMarks[oMarks.size() - 1].second = s.Length();
836  }
837  // else we have to add another entry for the end of the string
838  else {
839  Bool_t last_empty = (nOffset == s.Length());
840  if (!last_empty || maxfields < 0) {
841  if (!oCurrentTrailingEmpties.empty()) {
842  oMarks.insert(oMarks.end(),
843  oCurrentTrailingEmpties.begin(),
844  oCurrentTrailingEmpties.end());
845  }
846  oMarks.push_back(MarkerLoc_t(nOffset, s.Length()));
847  }
848  }
849 
850  fNMatches = oMarks.size();
852  for (Int_t i = 0; i < fNMatches; ++i) {
853  fMarkers[2*i] = oMarks[i].first;
854  fMarkers[2*i + 1] = oMarks[i].second;
855  }
856 
857  // fprintf(stderr, "match returning %d\n", fNMatches);
858  return fNMatches;
859 }
860 
861 ////////////////////////////////////////////////////////////////////////////////
862 /// Substitute matching part of s with r, dollar back-ref
863 /// substitution is performed if doDollarSubst is true (default).
864 /// Returns the number of substitutions made.
865 ///
866 /// After the substitution, another pass is made over the resulting
867 /// string and the following special tokens are interpreted:
868 /// - `\l` lowercase next char,
869 /// - `\u` uppercase next char,
870 /// - `\L` lowercase till `\E`,
871 /// - `\U` uppercase till `\E`, and
872 /// - `\E` end case modification.
873 
875 {
876  Int_t cnt = SubstituteInternal(s, r, 0, fNMaxMatches, doDollarSubst);
877 
878  TString ret;
879  Int_t state = 0;
880  Ssiz_t pos = 0, len = s.Length();
881  const Char_t *data = s.Data();
882  while (pos < len) {
883  Char_t c = data[pos];
884  if (c == '\\') {
885  c = data[pos+1]; // Rely on string-data being null-terminated.
886  switch (c) {
887  case 0 : ret += '\\'; break;
888  case 'l': state = 1; break;
889  case 'u': state = 2; break;
890  case 'L': state = 3; break;
891  case 'U': state = 4; break;
892  case 'E': state = 0; break;
893  default : ret += '\\'; ret += c; break;
894  }
895  pos += 2;
896  } else {
897  switch (state) {
898  case 0: ret += c; break;
899  case 1: ret += (Char_t) tolower(c); state = 0; break;
900  case 2: ret += (Char_t) toupper(c); state = 0; break;
901  case 3: ret += (Char_t) tolower(c); break;
902  case 4: ret += (Char_t) toupper(c); break;
903  default: Error("TPMERegexp::Substitute", "invalid state.");
904  }
905  ++pos;
906  }
907  }
908 
909  s = ret;
910 
911  return cnt;
912 }
913 
914 ////////////////////////////////////////////////////////////////////////////////
915 /// Returns the sub-string from the internal fMarkers vector.
916 /// Requires having run match or split first.
917 
919 {
920  if (index >= fNMatches)
921  return "";
922 
923  Int_t begin = fMarkers[2*index];
924  Int_t end = fMarkers[2*index + 1];
925  return fLastStringMatched(begin, end-begin);
926 }
927 
928 ////////////////////////////////////////////////////////////////////////////////
929 /// Print the regular expression and modifier options.
930 /// If 'option' contains "all", prints also last string match and
931 /// match results.
932 
934 {
935  TString opt = option;
936  opt.ToLower();
937 
938  Printf("Regexp='%s', Opts='%s'", fPattern.Data(), GetModifiers().Data());
939  if (opt.Contains("all")) {
940  Printf(" last string='%s'", fLastStringMatched.Data());
941  Printf(" number of matches = %d", fNMatches);
942  for (Int_t i=0; i<fNMatches; ++i)
943  Printf(" %d - %s", i, operator[](i).Data());
944  }
945 }
946 
947 
948 /** \class TStringToken
949 Provides iteration through tokens of a given string.
950 
951  - fFullStr stores the string to be split. It is never modified.
952  - fSplitRe is the perl-re that is used to separate the tokens.
953  - fReturnVoid if true, empty strings will be returned.
954 
955 Current token is stored in the TString base-class.
956 During construction no match is done, use NextToken() to get the first
957 and all subsequent tokens.
958 */
959 
961 
962 ////////////////////////////////////////////////////////////////////////////////
963 /// Constructor.
964 
965 TStringToken::TStringToken(const TString& fullStr, const TString& splitRe, Bool_t retVoid) :
966  fFullStr (fullStr),
967  fSplitRe (splitRe),
968  fReturnVoid (retVoid),
969  fPos (0)
970 {
971 }
972 
973 ////////////////////////////////////////////////////////////////////////////////
974 /// Get the next token, it is stored in this TString.
975 /// Returns true if new token is available, false otherwise.
976 
978 {
979  TArrayI x;
980  while (fPos < fFullStr.Length()) {
981  if (fSplitRe.Match(fFullStr, "", fPos, 2, &x)) {
983  fPos = x[1];
984  } else {
986  fPos = fFullStr.Length() + 1;
987  }
988  if (Length() || fReturnVoid)
989  return kTRUE;
990  }
991 
992  // Special case: void-strings are requested and the full-string
993  // ends with the separator. Thus we return another empty string.
994  if (fPos == fFullStr.Length() && fReturnVoid) {
995  TString::operator=("");
996  fPos = fFullStr.Length() + 1;
997  return kTRUE;
998  }
999 
1000  return kFALSE;
1001 }
A zero length substring is legal.
Definition: TString.h:77
Int_t SubstituteInternal(TString &s, const TString &replace, Int_t start, Int_t nMaxMatch0, Bool_t doDollarSubst) const
Perform pattern substitution with optional back-ref replacement.
Definition: TPRegexp.cxx:396
TString fLastStringMatched
Definition: TPRegexp.h:107
An array of TObjects.
Definition: TObjArray.h:37
Int_t fPos
Definition: TPRegexp.h:149
auto * m
Definition: textangle.C:8
Int_t fNMatches
Definition: TPRegexp.h:104
Int_t MatchInternal(const TString &s, Int_t start, Int_t nMaxMatch, TArrayI *pos=0) const
Perform the actual matching - protected method.
Definition: TPRegexp.cxx:307
Collectable string class.
Definition: TObjString.h:28
const char Option_t
Definition: RtypesCore.h:62
TArrayI fMarkers
Definition: TPRegexp.h:105
void AssignGlobalState(const TPMERegexp &re)
Copy global-match state from &#39;re; so that this regexp can continue parsing the string from where &#39;re&#39;...
Definition: TPRegexp.cxx:684
virtual void SetOwner(Bool_t enable=kTRUE)
Set whether this collection is the owner (enable==true) of its content.
Int_t ReplaceSubs(const TString &s, TString &final, const TString &replacePattern, Int_t *ovec, Int_t nmatch) const
Returns the number of expanded &#39;$&#39; constructs.
Definition: TPRegexp.cxx:251
TObjArray * MatchS(const TString &s, const TString &mods="", Int_t start=0, Int_t nMaxMatch=10)
Returns a TObjArray of matched substrings as TObjString&#39;s.
Definition: TPRegexp.cxx:370
static void SetThrowAtCompileError(Bool_t throwp)
Set static flag controlling whether exception should be thrown upon an error during regular expressio...
Definition: TPRegexp.cxx:508
virtual ~TPRegexp()
Cleanup.
Definition: TPRegexp.cxx:80
Ssiz_t Index(const char *pat, Ssiz_t i=0, ECaseCompare cmp=kExact) const
Definition: TString.h:634
Basic string class.
Definition: TString.h:131
TString GetModifiers() const
Return PCRE modifier options as string.
Definition: TPRegexp.cxx:180
void ToLower()
Change string to lower-case.
Definition: TString.cxx:1125
int Int_t
Definition: RtypesCore.h:41
TString & operator=(char s)
Assign character c to TString.
Definition: TString.cxx:267
bool Bool_t
Definition: RtypesCore.h:59
Int_t Substitute(TString &s, const TString &replace, const TString &mods="", Int_t start=0, Int_t nMatchMax=10)
Substitute replaces the string s by a new string in which matching patterns are replaced by the repla...
Definition: TPRegexp.cxx:472
TPRegexp fSplitRe
Definition: TPRegexp.h:147
Array of integers (32 bits per element).
Definition: TArrayI.h:27
PCREPriv_t * fPriv
Definition: TPRegexp.h:47
Int_t fNMaxMatches
Definition: TPRegexp.h:103
Double_t x[n]
Definition: legend1.C:17
TStringToken(const TString &fullStr, const TString &splitRe, Bool_t retVoid=kFALSE)
Constructor.
Definition: TPRegexp.cxx:965
static TString Format(const char *fmt,...)
Static method which formats a string using a printf-style format descriptor and returns a TString...
Definition: TString.cxx:2311
Provides iteration through tokens of a given string.
Definition: TPRegexp.h:143
void Compile()
Compile the fPattern.
Definition: TPRegexp.cxx:198
void Info(const char *location, const char *msgfmt,...)
static constexpr double s
Bool_t IsValid() const
Returns true if underlying PCRE structure has been successfully generated via regexp compilation...
Definition: TPRegexp.cxx:490
void Set(Int_t n)
Set size of this array to n ints.
Definition: TArrayI.cxx:105
void Error(const char *location, const char *msgfmt,...)
char & operator()(Ssiz_t i)
Definition: TString.h:709
ROOT::R::TRInterface & r
Definition: Object.C:4
Int_t Split(const TString &s, Int_t maxfields=0)
Splits into at most maxfields.
Definition: TPRegexp.cxx:765
void * fAddressOfLastString
Definition: TPRegexp.h:108
TPRegexp & operator=(const TPRegexp &p)
Assignment operator.
Definition: TPRegexp.cxx:92
unsigned int UInt_t
Definition: RtypesCore.h:42
virtual void Print(Option_t *option="")
Print the regular expression and modifier options.
Definition: TPRegexp.cxx:933
Ssiz_t Length() const
Definition: TString.h:405
const Bool_t kFALSE
Definition: RtypesCore.h:88
int Ssiz_t
Definition: RtypesCore.h:63
#define ClassImp(name)
Definition: Rtypes.h:365
UInt_t fPCREOpts
Definition: TPRegexp.h:48
TPMERegexp()
Default constructor. This regexp will match an empty string.
Definition: TPRegexp.cxx:588
void Printf(const char *fmt,...)
Bool_t Contains(const char *pat, ECaseCompare cmp=kExact) const
Definition: TString.h:619
Bool_t NextToken()
Get the next token, it is stored in this TString.
Definition: TPRegexp.cxx:977
static Bool_t fgThrowAtCompileError
Definition: TPRegexp.h:50
UInt_t ParseMods(const TString &mods) const
Translate Perl modifier flags into pcre flags.
Definition: TPRegexp.cxx:135
static Bool_t GetThrowAtCompileError()
Get value of static flag controlling whether exception should be thrown upon an error during regular ...
Definition: TPRegexp.cxx:499
Int_t Match(const TString &s, UInt_t start=0)
Runs a match on s against the regex 'this' was created with.
Definition: TPRegexp.cxx:708
char Char_t
Definition: RtypesCore.h:29
const TString fFullStr
Definition: TPRegexp.h:146
Wrapper for PCRE library (Perl Compatible Regular Expressions).
Definition: TPRegexp.h:97
Int_t fLastGlobalPosition
Definition: TPRegexp.h:110
Bool_t fReturnVoid
Definition: TPRegexp.h:148
#define c(i)
Definition: RSha256.hxx:101
void Add(TObject *obj)
Definition: TObjArray.h:74
void Reset(const TString &s, const TString &opts="", Int_t nMatchMax=-1)
Reset the pattern and options.
Definition: TPRegexp.cxx:653
Int_t Substitute(TString &s, const TString &r, Bool_t doDollarSubst=kTRUE)
Substitute matching part of s with r, dollar back-ref substitution is performed if doDollarSubst is t...
Definition: TPRegexp.cxx:874
friend class TSubString
Definition: TString.h:134
Int_t Match(const TString &s, const TString &mods="", Int_t start=0, Int_t nMaxMatch=10, TArrayI *pos=0)
The number of matches is returned, this equals the full match + sub-pattern matches.
Definition: TPRegexp.cxx:339
const Bool_t kTRUE
Definition: RtypesCore.h:87
TPRegexp()
Default ctor.
Definition: TPRegexp.cxx:51
TString operator[](Int_t)
Returns the sub-string from the internal fMarkers vector.
Definition: TPRegexp.cxx:918
const char * cnt
Definition: TXMLSetup.cxx:74
TString fPattern
Definition: TPRegexp.h:46
void Optimize()
Send the pattern through the optimizer.
Definition: TPRegexp.cxx:230
const char * Data() const
Definition: TString.h:364
void ResetGlobalState()
Reset state of global match.
Definition: TPRegexp.cxx:696