Logo ROOT   6.18/05
Reference Guide
regexp.C
Go to the documentation of this file.
1/// \file
2/// \ingroup Tutorials
3/// A regular expression, often called a pattern, is an expression that describes a set of
4/// strings. They are usually used to give a concise description of a set, without having to
5/// list all elements.
6/// The Unix utilities like sed and grep make extensive use of regular expressions. Scripting
7/// languages like Perl have regular expression engines built directly into their syntax .
8///
9/// Extensive documentation about Regular expressions in Perl can be
10/// found at: http://perldoc.perl.org/perlre.html
11///
12/// ROOT has this capability through the use of the P(erl) C(ompatible) R(egular) E(xpression)
13/// - library, PCRE, see http://www.pcre.org
14///
15/// Its functionality can be accessed through the TPRegexp and TString classes.
16/// Note that in patterns taken from Perl all backslash characters have to be replaced in the
17/// C/C++ strings by two backslashes.
18///
19/// This macro shows several ways to use the Match/Substitute capabilities of
20/// the TPRegexp class. It can be run as follows:
21/// ~~~
22/// .x regexp.C
23/// ~~~
24///
25/// \macro_output
26/// \macro_code
27///
28/// \author Eddy Offermann
29
30#include "Riostream.h"
31#include "TString.h"
32#include "TPRegexp.h"
33#include "TClonesArray.h"
34#include "TObjString.h"
35
36
37
38void regexp()
39{
40 // Substitute example :
41 // Find a word that starts with "peper" and ends with "koek" .
42
43 TString s1("lekkere pepernotenkoek");
44 TPRegexp r1("\\bpeper(\\w+)koek\\b");
45
46 // Note that the TString class gives access to some of the simpler TPRegexp functionality .
47 // The following command returns the fully matched string .
48 cout << s1(r1) << endl;
49
50 // In the "Substitute" command, keep the middle part (indicated in the regexp by "(\\w+)"
51 // and the substitute string by "$1") and sandwich it between "wal" and "boom" .
52 r1.Substitute(s1,"wal$1boom");
53 cout << s1 << endl;
54
55 // Substitute example :
56 // Swap first two words in a string
57
58 TString s2("one two three");
59 TPRegexp("^([^ ]+) +([^ ]+)").Substitute(s2,"$2 $1");
60 cout << s2 << endl;
61
62 // Substitute example :
63 // $1, $2, and so on, in the substitute string are equivalent to whatever the corresponding set
64 // of parentheses match in the regexp string, counting opening parentheses from left to right .
65 // In the following example, we are trying to catch a date MMDDYYYY in a string and rearrange
66 // it to DDMMYYY . "(\\d{1,2}) matches only 1 or 2 digits etc .
67
68 TString s3("on 09/24/1959 the world stood still");
69 TPRegexp("\\b(\\d{1,2})/(\\d{1,2})/(\\d{4})\\b").Substitute(s3,"$2-$1-$3");
70 cout << s3 << endl;
71
72 // Match Example :
73 // The following example shows how to extract a protocol and port number from an URL string .
74 // Note again the parentheses in the regexp string : "(\\w+)" requires a non-empty
75 // alphanumeric string while "(\\d+)" wants a pure digital string .
76 // The matched substrings together with the full matched string are returned in a
77 // TObjArray . The first entry is the full string while next entries are the substrings
78 // in the order as listed in the regexp string .
79 //
80 // Note that there is also a Match(..) command that returns the positions of the
81 // substrings in the input string .
82
83 TString s4("http://fink.sourceforge.net:8080/index/readme.html");
84 TObjArray *subStrL = TPRegexp("^(\\w+)://[^/]+:(\\d+)/$").MatchS(s4);
85 const Int_t nrSubStr = subStrL->GetLast()+1;
86 if (nrSubStr > 2) {
87 const TString proto = ((TObjString *)subStrL->At(1))->GetString();
88 const TString port = ((TObjString *)subStrL->At(2))->GetString();
89 cout << "protocol: " << proto << " port: " << port << endl;
90 }
91
92 // Match Example :
93 // This example returns kTRUE if the email address is valid . For that it has to fulfil the following
94 // criteria:
95 // 1) It should be of the form string1@string2 . The "^" and "$" ensure that we compare the complete
96 // email string
97 // 2) ([\\w-\\.]+) :
98 // string1 is only allowed to be composed out of the alphanumeric characters, "-" and "." .
99 // The "+" ensures that string1 can not be empty .
100 // 3) string2 is matched against three different parts :
101 // a. ((\\[[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.)|(([\\w-]+\\.)+)) :
102 // This regular expression ensures that EITHER the string starts with "[" followed by three groups
103 // of numbers, separated by "." , where each group has 1 to 3 numbers, OR alphanumeric strings,
104 // possibly containing "-" characters, separated by "." .
105 // b. ([a-zA-Z]{2,4}|[0-9]{1,3}) :
106 // This part contains EITHER 2 to 4 alpha characters OR 1 to 3 numbers
107 // c. (\\]?) :
108 // At most one "]" character .
109
110 TString s5("fons.rademakers@cern.ch");
111 TPRegexp r5("^([\\w-\\.]+)@((\\[[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.)|(([\\w-]+\\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\\]?)$");
112 cout << "Check if the email address \"" << s5 << "\" is valid: " << (r5.MatchB(s5) ? "TRUE" : "FALSE") << endl;
113
114 // Substitute Example with pattern modifier :
115 // Like in Perl, Substitute/Match commands accept modifier arguments . For instance a "g" modifier causes to
116 // match the regexp globally . In the example below, all words starting and ending with the character "n"
117 // are replaced by the word neutrino .
118
119 TString s6("neutron proton electron neutron");
120 TPRegexp("(n\\w+n)").Substitute(s6,"neutrino","g");
121 cout << s6 << endl;
122}
#define s1(x)
Definition: RSha256.hxx:91
int Int_t
Definition: RtypesCore.h:41
const char * proto
Definition: civetweb.c:16604
An array of TObjects.
Definition: TObjArray.h:37
Int_t GetLast() const
Return index of last object in array.
Definition: TObjArray.cxx:576
TObject * At(Int_t idx) const
Definition: TObjArray.h:166
Collectable string class.
Definition: TObjString.h:28
TObjArray * MatchS(const TString &s, const TString &mods="", Int_t start=0, Int_t nMaxMatch=10)
Returns a TObjArray of matched substrings as TObjString's.
Definition: TPRegexp.cxx:370
Int_t Substitute(TString &s, const TString &replace, const TString &mods="", Int_t start=0, Int_t nMatchMax=10)
Substitute replaces the string s by a new string in which matching patterns are replaced by the repla...
Definition: TPRegexp.cxx:472
Basic string class.
Definition: TString.h:131