Logo ROOT   6.10/09
Reference Guide
regexp.C
Go to the documentation of this file.
1 /// \file
2 /// \ingroup Tutorials
3 /// A regular expression, often called a pattern, is an expression that describes a set of
4 /// strings. They are usually used to give a concise description of a set, without having to
5 /// list all elements.
6 /// The Unix utilities like sed and grep make extensive use of regular expressions. Scripting
7 /// languages like Perl have regular expression engines built directly into their syntax .
8 ///
9 /// Extensive documentation about Regular expressions in Perl can be
10 /// found at: http://perldoc.perl.org/perlre.html
11 ///
12 /// ROOT has this capability through the use of the P(erl) C(ompatible) R(egular) E(xpression)
13 /// - library, PCRE, see http://www.pcre.org
14 ///
15 /// Its functionality can be accessed through the TPRegexp and TString classes.
16 /// Note that in patterns taken from Perl every backslash character has to be replaced in the
17 /// C/C++ strings by two backslashes.
18 ///
19 /// This macro shows several ways to use the Match/Substitute capabilities of the
20 /// TPRegexp class. It can be run as follows:
21 /// ~~~
22 /// .x regexp.C
23 /// ~~~
24 ///
25 /// \macro_output
26 /// \macro_code
27 ///
28 /// \author Eddy Offermann
29 
30 #include "Riostream.h"
31 #include "TString.h"
32 #include "TPRegexp.h"
33 #include "TClonesArray.h"
34 #include "TObjString.h"
35 
36 
37 
38 void regexp()
39 {
40  // Substitute example :
41  // Find a word that starts with "peper" and ends with "koek" .
42 
43  TString s1("lekkere pepernotenkoek");
44  TPRegexp r1("\\bpeper(\\w+)koek\\b");
45 
46  // Note that the TString class gives access to some of the simpler TPRegexp functionality .
47  // The following command returns the fully matched string .
48  cout << s1(r1) << endl;
49 
50  // In the "Substitute" command, keep the middle part (indicated in the regexp by "(\\w+)"
51  // and the substitute string by "$1") and sandwich it between "wal" and "boom" .
52  r1.Substitute(s1,"wal$1boom");
53  cout << s1 << endl;
54 
55  // Substitute example :
56  // Swap first two words in a string
57 
58  TString s2("one two three");
59  TPRegexp("^([^ ]+) +([^ ]+)").Substitute(s2,"$2 $1");
60  cout << s2 << endl;
61 
62  // Substitute example :
63  // $1, $2, and so on, in the substitute string are equivalent to whatever the corresponding set
64  // of parentheses match in the regexp string, counting opening parentheses from left to right .
65  // In the following example, we are trying to catch a date MMDDYYYY in a string and rearrange
66  // it to DDMMYYY . "(\\d{1,2}) matches only 1 or 2 digits etc .
67 
68  TString s3("on 09/24/1959 the world stood still");
69  TPRegexp("\\b(\\d{1,2})/(\\d{1,2})/(\\d{4})\\b").Substitute(s3,"$2-$1-$3");
70  cout << s3 << endl;
71 
72  // Match Example :
73  // The following example shows how to extract a protocol and port number from an URL string .
74  // Note again the parentheses in the regexp string : "(\\w+)" requires a non-empty
75  // alphanumeric string while "(\\d+)" wants a pure digital string .
76  // The matched substrings together with the full matched string are returned in a
77  // TObjArray . The first entry is the full string while next entries are the substrings
78  // in the order as listed in the regexp string .
79  //
80  // Note that there is also a Match(..) command that returns the positions of the
81  // substrings in the input string .
82 
83  TString s4("http://fink.sourceforge.net:8080/index/readme.html");
84  TObjArray *subStrL = TPRegexp("^(\\w+)://[^/]+:(\\d+)/$").MatchS(s4);
85  const Int_t nrSubStr = subStrL->GetLast()+1;
86  if (nrSubStr > 2) {
87  const TString proto = ((TObjString *)subStrL->At(1))->GetString();
88  const TString port = ((TObjString *)subStrL->At(2))->GetString();
89  cout << "protocol: " << proto << " port: " << port << endl;
90  }
91 
92  // Match Example :
93  // This example returns kTRUE if the email address is valid . For that it has to fulfil the following
94  // criteria:
95  // 1) It should be of the form string1@string2 . The "^" and "$" ensure that we compare the complete
96  // email string
97  // 2) ([\\w-\\.]+) :
98  // string1 is only allowed to be composed out of the alphanumeric characters, "-" and "." .
99  // The "+" ensures that string1 can not be empty .
100  // 3) string2 is matched against three different parts :
101  // a. ((\\[[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.)|(([\\w-]+\\.)+)) :
102  // This regular expression ensures that EITHER the string starts with "[" followed by three groups
103  // of numbers, separated by "." , where each group has 1 to 3 numbers, OR alphanumeric strings,
104  // possibly containing "-" characters, separated by "." .
105  // b. ([a-zA-Z]{2,4}|[0-9]{1,3}) :
106  // This part contains EITHER 2 to 4 alpha characters OR 1 to 3 numbers
107  // c. (\\]?) :
108  // At most one "]" character .
109 
110  TString s5("fons.rademakers@cern.ch");
111  TPRegexp r5("^([\\w-\\.]+)@((\\[[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.)|(([\\w-]+\\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\\]?)$");
112  cout << "Check if the email address \"" << s5 << "\" is valid: " << (r5.MatchB(s5) ? "TRUE" : "FALSE") << endl;
113 
114  // Substitute Example with pattern modifier :
115  // Like in Perl, Substitute/Match commands accept modifier arguments . For instance a "g" modifier causes to
116  // match the regexp globally . In the example below, all words starting and ending with the character "n"
117  // are replaced by the word neutrino .
118 
119  TString s6("neutron proton electron neutron");
120  TPRegexp("(n\\w+n)").Substitute(s6,"neutrino","g");
121  cout << s6 << endl;
122 }
An array of TObjects.
Definition: TObjArray.h:37
Collectable string class.
Definition: TObjString.h:28
TObjArray * MatchS(const TString &s, const TString &mods="", Int_t start=0, Int_t nMaxMatch=10)
Returns a TObjArray of matched substrings as TObjString's.
Definition: TPRegexp.cxx:367
Basic string class.
Definition: TString.h:129
int Int_t
Definition: RtypesCore.h:41
Int_t Substitute(TString &s, const TString &replace, const TString &mods="", Int_t start=0, Int_t nMatchMax=10)
Substitute replaces the string s by a new string in which matching patterns are replaced by the repla...
Definition: TPRegexp.cxx:469
TObject * At(Int_t idx) const
Definition: TObjArray.h:165
Int_t GetLast() const
Return index of last object in array.
Definition: TObjArray.cxx:528
unsigned int r1[N_CITIES]
Definition: simanTSP.cxx:321
const char * proto
Definition: civetweb.c:11652