Perl regular expressions (PRX) consist of characters and special characters called metacharacters. SAS searches a source string for a substring that matches the Perl regular expression. For example, if you use the metacharacter \d, SAS matches a digit between 0 and 9. If you use /\d+/, SAS finds the digits “27506” in the string “Raleigh, NC 27506”. Tables of Perl Regular Expression (PRX) Metacharacters can be found here.

The PRXMATCH function finds the position of a matched value in a source string, and the PRXCHANGE function performs substitution. PRXMATCH is called as PRXMATCH(perl-regular-expression, source) with a pattern of the form /search-string/, and PRXCHANGE is called as PRXCHANGE(perl-regular-expression, times, source) with a pattern of the form s/regular-expression/replacement-string/; a times value of -1 replaces every match. Paired forward slashes are the default delimiters. The example below shows how to find the position of the first Unicode string in the source string and replace all Unicode strings with the capital letter U. To match characters such as {}[]()^$.|*+?\ literally, place the escape character “\” before them.

PRXMATCH and PRXCHANGE Function

/* Find the first "(*ESC*){unicode ...}" sequence in text1, then replace every such sequence with U. */
26             data test;
27                        text1 = “x (*ESC*){unicode ‘2265’x} 2 and y (*ESC*){unicode ‘2264’x} 2”;
/* Pattern: literal "(*ESC*){unicode", one whitespace, one non-digit (opening quote), */
/* four digits, two non-digits (closing quote and x), then a literal "}".             */
28                        pos1 = prxmatch(‘/\(\*ESC\*\)\{unicode\s\D\d{4}\D{2}\}/’,text1);
29                        put ‘Result of prxmatch: ‘ pos1;
/* Same pattern in s/.../U/ form; the -1 argument asks for every match to be replaced. */
30                         sub1 = prxchange(‘s/\(\*ESC\*\)\{unicode\s\D\d{4}\D{2}\}/U/’,-1,text1);
31                         put ‘Result of prxchange: ‘ sub1;
32             run;

Result of prxmatch: 3
Result of prxchange: x U 2 and y U 2

This example shows how to find the position of the first superscript string in the source string and replace all superscript strings with the capital letter S.

/* Find the first "(*ESC*){sup ...}" sequence in text1, then replace every such sequence with S. */
34             data test2;
35                         text1 = “x(*ESC*){sup ‘2’} + y(*ESC*){sup ‘2’} = z(*ESC*){sup ‘2’}”;
/* Pattern: literal "(*ESC*){sup", one whitespace, one non-digit (opening quote), */
/* one or more word characters (lazy), one non-digit (closing quote), literal "}". */
36                         pos2 = prxmatch(‘/\(\*ESC\*\)\{sup\s\D\w+?\D\}/’,text1);
37                         put ‘Result of prxmatch: ‘ pos2;
/* Same pattern in s/.../S/ form; the -1 argument asks for every match to be replaced. */
38                         sub2 = prxchange(‘s/\(\*ESC\*\)\{sup\s\D\w+?\D\}/S/’,-1,text1);
39                         put ‘Result of prxchange: ‘ sub2;
40             run;

Result of prxmatch: 2
Result of prxchange: xS + yS = zS

PRXPARSE, PRXMATCH Function and CALL PRXSUBSTR Routine

PRXPARSE compiles a Perl regular expression (PRX) that can be used for pattern matching of a character value. The syntax is regular-expression-id = PRXPARSE(perl-regular-expression). The returned pattern identifier number can be used by other PRX functions and CALL routines to match patterns.

Besides PRXMATCH, the CALL PRXSUBSTR routine can also be used to return the position of a substring that matches a pattern; in addition, it returns the length of the matched string. However, both can only return the position of the first string that matches the pattern.

/* Compile the pattern once with PRXPARSE, then reuse its identifier with PRXMATCH and CALL PRXSUBSTR. */
26             data test;
27                         text1 = “x (*ESC*){unicode ‘2265’x} 2 and y (*ESC*){unicode ‘2264’x} 2”;
/* patternID holds the identifier of the compiled "(*ESC*){unicode ...}" pattern. */
28                         patternID = prxparse(‘/\(\*ESC\*\)\{unicode\s\D\d{4}\D{2}\}/’);
29                         pos1 = prxmatch(patternID,text1);
30                         put ‘Result of prxmatch: ‘ pos1;
/* CALL PRXSUBSTR fills position and length for the first match in text1. */
31                         call prxsubstr(patternID, text1, position, length);
32                         put ‘Result of prxsubstr: ‘ position =  length = ;
33             run;

Result of prxmatch: 3
Result of prxsubstr: position=3 length=24

PRXPARSE and CALL PRXNEXT Routine

The CALL PRXNEXT routine returns the position and length of a substring that matches a pattern. Moreover, it can iterate over multiple matches within one source string. The syntax is CALL PRXNEXT(regular-expression-id, start, stop, source, position, length).

regular-expression-id
specifies a numeric variable with a value that is the identification number that is returned by the PRXPARSE function.

start
is a numeric variable that specifies the position at which to start the pattern matching in source. If the match is successful, CALL PRXNEXT sets start to position + MAX(1, length), so the next call resumes after the current match. If the match is not successful, the value of start is not changed.

stop
is a numeric constant, variable, or expression that specifies the last character to use in source. If stop is –1, the last character is the last non-blank character in source.

source
specifies a character constant, variable, or expression that you want to search.

position
is a numeric variable with a returned value that is the position in source at which the pattern begins. If no match is found, CALL PRXNEXT returns 0.

length
is a numeric variable with a returned value that is the length of the string that is matched by the pattern. If no match is found, CALL PRXNEXT returns 0.

/* Walk through every "(*ESC*){unicode ...}" match in text1 with CALL PRXNEXT and print each one. */
26             data test;
27                         text1 = “x (*ESC*){unicode ‘2265’x} 2 and y (*ESC*){unicode ‘2264’x} 2”;
/* Same pattern as the earlier examples, with the trailing i modifier added. */
28                         patternID = prxparse(‘/\(\*ESC\*\)\{unicode\s\D\d{4}\D{2}\}/i’);
/* Search window: from the first character through the length of text1. */
29                         start = 1;
30                         stop = length(text1);
/* Prime the loop with the first match; position is 0 when nothing matches. */
31                         call prxnext(patternID, start, stop, text1, position, length);
32                         do while (position > 0);
33                                      found = substr(text1, position, length);
34                                      put ‘Result of prxnext: ‘ found =  position =  length = ;
/* Each successful call moves start past the match, so this call looks for the next one. */
35                                      call prxnext(patternID, start, stop, text1, position, length);
36                         end;
37             run;

Result of prxnext: found=(*ESC*){unicode ‘2265’x} position=3 length=24
Result of prxnext: found=(*ESC*){unicode ‘2264’x} position=36 length=24

PRXPARSE and PRXPOSN Function

Function PRXPOSN can return a character string that contains the value of a capture buffer. Syntax is PRXPOSN(regular-expression-id, capture-buffer, source). Regular-expression-id is a pattern identifier returned by PRXPARSE function. Capture-buffer is a numeric constant, variable or expression that identifies the capture buffer for which to retrieve a value.

If it is zero, PRXPOSN returns the entire match.
If it is between 1 and the number of open parentheses in regular expression, PRXPOSN returns the value for that capture buffer.
If it is greater than the number of open parentheses, PRXPOSN returns a missing value.

*Each paired parenthesis in regular-expression represents a buffer.

/* Parse "First Last (ddd)ddd-dddd" records and print each capture buffer to the log.     */
/* The step name is the keyword _null_ so that no output data set is created; the earlier */
/* spelling _null (no trailing underscore) would create a data set named WORK._NULL.      */
26             data _null_;
27                                      retain re;
/* Three capture buffers: (1) first name, (2) last name, (3) phone number "(ddd)ddd-dddd". */
28                                      re = prxparse(‘/(\D+)\s(\D+)\s(\(\d{3}?\)\d{3}?-\d{4}?)/’);
/* Read columns 1-27 of each data line into string. */
29                                      input string $ 1-27;
/* PRXMATCH must succeed before PRXPOSN can return the capture buffers for this record. */
30                                      if prxmatch(re, string) then do;
31                                      first = prxposn(re, 1, string);
32                                      put _n_ ‘ First Name: ‘ first;
33                                      last = prxposn(re, 2, string);
34                                      put _n_ ‘ Last Name: ‘ last;
35                                      phone = prxposn(re, 3, string);
36                                      put _n_ ‘ Telephone: ‘ phone;
37                                      end;
datalines;
Thomas Archer (919)319-1677
Lucy Barr (800)899-2164
Tom Joad (508)852-2146
Laurie Gil (252)352-7583
;
run;

1         First Name: Thomas
1         Last Name: Archer
1         Telephone: (919)319-1677
2         First Name: Lucy
2         Last Name: Barr
2         Telephone: (800)899-2164
3         First Name: Tom
3         Last Name: Joad
3         Telephone: (508)852-2146
4         First Name: Laurie
4         Last Name: Gil
4         Telephone: (252)352-7583

Macro to Remove Quoted Strings from Macro Variable

/* %split: break IN_VAR into pieces at each DELIM character, blank out any double-quoted */
/* text inside each piece, and collect the non-empty pieces, blank-separated, in the     */
/* global macro variable OUT_VAR.                                                        */
/*   in_var = text to split                                                              */
/*   delim  = delimiter character(s) passed to COUNTC and SCAN                           */
26         %macro split(in_var = , delim = );
/* n = number of delimiter characters found in in_var, so there are n+1 pieces to scan. */
27                  %let n = %sysfunc(countc(&in_var, &delim));
/* Write n and the delimiter to the log. */
28                  %put &n;
29                  %put &delim;
/* out_var is global so the caller can read the result after the macro ends; reset it first. */
30                  %global out_var;
31                  %let out_var = ;
32                  %do i = 1 %to %eval(&n+1);
/* var_ = the i-th delimiter-separated piece of in_var. */
33                                    %let var_ = %qsysfunc(scan(&in_var,&i, &delim));
/* Replace double-quoted text in the piece with a blank and store it in split<i>. */
/* NOTE(review): the %if and %else branches differ only by a blank after the equals sign. */
34                                    %if &i = 1 %then %let split&i =  %qsysfunc(prxchange(s/”(.+)”/ /,-1,&var_));
35                                    %else %let split&i = %qsysfunc(prxchange(s/”(.+)”/ /,-1,&var_));
36                                    %put &&split&i;
/* Append only non-empty pieces; the %else branch leaves out_var as it was. */
37                                    %if &&split&i ne %then %let out_var = &out_var &&split&i;
38                                    %else %let out_var = &out_var;
39                  %end;
40         %mend;

/* Sample input: a column list containing nested parentheses and quoted header text. */
41         %let rpt_vars = rptpage _blokord_ _subord_ _level_ _blokcol_ (“\qc 200 mg Q3W” (“\qc China” colval1 colval2 colval3)  (“\qc Non-China” colval4) colval5) colval6 ;
42
/* Pass 1: split on the left parenthesis; %nrstr(%() passes an unmatched "(" as the delimiter. */
43          %split(in_var = &rpt_vars, delim = %nrstr(%());
/* Keep the pass-1 result, because the next call resets out_var. */
44          %let var1 = &out_var;
/* Pass 2: split the pass-1 result on the right parenthesis. */
45          %split(in_var = &var1, delim = %nrstr(%)));
/* Print the final list to the log. */
47          %put &out_var;
rptpage  _blokord_  _subord_  _level_  _blokcol_   colval1 colval2 colval3  colval4  colval5  colval6