现在的位置: 首页 > 综合 > 正文

Some examples of Perl regular expressions in SAS

2018年10月23日 ⁄ 综合 ⁄ 共 7129字 ⁄ 字号 评论关闭

详见:http://www2.sas.com/proceedings/sugi29/265-29.pdf

 

/* Program 1: Use a Perl regular expression to locate lines that contain an
   exact text match.
   Primary functions: PRXPARSE, PRXMATCH */
DATA _NULL_;
   TITLE "Perl Regular Expression Tutorial – Program 1";
   RETAIN PATTERN_NUM;
   /* PRXPARSE compiles the regular expression; do it only on the first
      iteration and keep the returned pattern identifier via RETAIN. */
   IF _N_ = 1 THEN DO;
      PATTERN_NUM = PRXPARSE("/cat/");
   END;
   INPUT STRING $30.;
   /* PRXMATCH searches STRING for the compiled pattern and returns the
      position where the match starts, or 0 when there is no match. */
   POSITION = PRXMATCH(PATTERN_NUM,STRING);
   /* Write the results to the output listing rather than the log. */
   FILE PRINT;
   PUT PATTERN_NUM= STRING= POSITION=;
DATALINES;
There is a cat in this line.
Does not match CAT
cat in the beginning
At the end, a cat
cat
;

/* Program 2: Use a regular expression to search for phone numbers in a string.
   Primary functions: PRXPARSE, PRXMATCH */
DATA PHONE;
   RETAIN PATTERN;
   /* Compile the pattern once. It matches any phone number of the form
      (nnn)nnn-nnnn or (nnn) nnn-nnnn:
         \(        a left parenthesis
         \d\d\d    any three digits
         \)        a right parenthesis
         (blank)?  zero or one blank
         \d\d\d    any three digits
         -         a dash
         \d{4}     any four digits */
   IF _N_ = 1 THEN DO;
      PATTERN = PRXPARSE("/\(\d\d\d\) ?\d\d\d-\d{4}/");
   END;
   INPUT STRING $CHAR40.;
   /* Subsetting IF: only lines containing a phone number reach the
      implicit output at the end of the step. */
   IF PRXMATCH(PATTERN,STRING) > 0;
DATALINES;
One number (123)333-4444
Two here:(800)234-2222 and (908) 444-2344
None here
;
PROC PRINT DATA=PHONE NOOBS;
   TITLE "Listing of Data Set Phone";
RUN;

/* Program 3: Modify Program 2 to search for toll-free phone numbers.
   Primary functions: PRXPARSE, PRXMATCH
   Other function: MISSING */
DATA TOLL_FREE;
   /* Fix: the DO below needs its own terminating semicolon. Without it the
      statement was parsed as an iterative DO loop using RE as the index
      variable instead of a simple DO group. */
   IF _N_ = 1 THEN DO;
      /* The regular expression looks for phone numbers of the form
         (nnn)nnn-nnnn or (nnn) nnn-nnnn. In addition, the first digit of
         the area code must be an 8 and the next two digits must be
         00, 77, or 87. The trailing \b requires a word boundary after the
         last digit. */
      RE = PRXPARSE("/\(8(00|77|87)\) ?\d\d\d-\d{4}\b/");
      /* Stop the step when the pattern could not be compiled. */
      IF MISSING(RE) THEN DO;
         PUT "ERROR IN COMPILING REGULAR EXPRESSION";
         STOP;
      END;
   END;
   RETAIN RE;
   INPUT STRING $CHAR80.;
   /* POSITION is the start of the first toll-free number, 0 if none. */
   POSITION = PRXMATCH(RE,STRING);
   IF POSITION GT 0 THEN OUTPUT;
DATALINES;
One number on this line (877)234-8765
No numbers here
One toll free, one not:(908)782-6354 and (800)876-3333 xxx
Two toll free:(800)282-3454 and (887) 858-1234
No toll free here (609)848-9999 and (908) 345-2222
;
PROC PRINT DATA=TOLL_FREE NOOBS;
   TITLE "Listing of Data Set TOLL_FREE";
RUN;

/* Program 4: Use PRXMATCH without PRXPARSE by passing the regular
   expression directly as the first argument.
   Primary function: PRXMATCH */
DATA MATCH_IT;
   INPUT @1 STRING $20.;
   /* Position of the first run of three consecutive digits; 0 if none. */
   POSITION = PRXMATCH("/\d\d\d/",STRING);
DATALINES;
LINE 345 IS HERE
NONE HERE
ABC1234567
;
PROC PRINT DATA=MATCH_IT NOOBS;
   TITLE "Listing of Data Set MATCH_IT";
RUN;

/* Program 5: Locate all 5- or 9-digit zip codes in a list of addresses.
   Primary functions: PRXPARSE and PRXSUBSTR
   Other function: SUBSTRN */
DATA ZIPCODE(KEEP=ZIP_CODE);
   RETAIN RE;
   /* Match a blank followed by 5 digits, followed by either nothing or a
      dash and 4 digits:
         \d{5}   5 digits
         -       a dash
         \d{4}   4 digits
         ?       zero or one of the preceding subexpression */
   IF _N_ = 1 THEN DO;
      RE = PRXPARSE("/ \d{5}(-\d{4})?/");
   END;
   INPUT STRING $80.;
   LENGTH ZIP_CODE $ 10;
   /* PRXSUBSTR sets START and LENGTH to the position and length of the
      match; START is 0 when nothing matched. */
   CALL PRXSUBSTR(RE,STRING,START,LENGTH);
   IF START > 0 THEN DO;
      /* Skip the leading blank that is part of the matched text. */
      ZIP_CODE = SUBSTRN(STRING,START + 1,LENGTH - 1);
      OUTPUT;
   END;
DATALINES;
John Smith
12 Broad Street
Flemington, NJ 08822
Philip Judson
Apt #1, Building 7
777 Route 730
Kerrville, TX 78028
Dr. Roger Alan
44 Commonwealth Ave.
Boston, MA 02116-7364
;
PROC PRINT DATA=ZIPCODE NOOBS;
   TITLE "Listing of Data Set ZIPCODE";
RUN;

/* Program 6: Extract a phone number from a text string.
   Primary functions: PRXPARSE, PRXSUBSTR
   Other functions: SUBSTR, COMPRESS, and MISSING */
DATA EXTRACT;
   IF _N_ = 1 THEN DO;
      /* Matches (nnn)nnn-nnnn or (nnn) nnn-nnnn. */
      PATTERN = PRXPARSE("/\(\d\d\d\) ?\d\d\d-\d{4}/");
      /* Stop the step when the pattern could not be compiled. */
      IF MISSING(PATTERN) THEN DO;
         PUT "ERROR IN COMPILING REGULAR EXPRESSION";
         STOP;
      END;
   END;
   RETAIN PATTERN;
   LENGTH NUMBER $ 15;
   INPUT STRING $CHAR80.;
   /* START and LENGTH receive the position and length of the first match;
      START is 0 when the line holds no phone number. */
   CALL PRXSUBSTR(PATTERN,STRING,START,LENGTH);
   IF START GT 0 THEN DO;
      /* Fix: the two statements below were garbled in the source
         ("SUBSTRTRING,START,LENGTH);" and "(S NUMBER = ..."). */
      NUMBER = SUBSTR(STRING,START,LENGTH);
      /* Remove the optional blank after the area code. */
      NUMBER = COMPRESS(NUMBER," ");
      OUTPUT;
   END;
   KEEP NUMBER;
DATALINES;
THIS LINE DOES NOT HAVE ANY PHONE NUMBERS ON IT
THIS LINE DOES: (123)345-4567 LA DI LA DI LA
ALSO VALID (123) 999-9999
TWO NUMBERS HERE (333)444-5555 AND (800)123-4567
;
PROC PRINT DATA=EXTRACT NOOBS;
   TITLE "Extracted Phone Numbers";
RUN;

/* Program 7: Use the PRXPOSN routine to extract the area code and exchange
   from a phone number.
   Primary functions: PRXPARSE, PRXMATCH, PRXPOSN
   Other function: SUBSTR */
DATA PIECES;
   /* Fix: compile the pattern on the first iteration only. The original
      condition "IF _N_" was true for every observation, so PRXPARSE was
      called again for each input line.
      Pattern elements:
         \(         an open parenthesis
         (\d\d\d)   three digits, captured as buffer 1 (area code)
         \)         a closed parenthesis
         b?         zero or one blank (b = blank)
         (\d\d\d)   three digits, captured as buffer 2 (exchange)
         -          a dash
         \d{4}      four digits */
   IF _N_ = 1 THEN RE = PRXPARSE("/\((\d\d\d)\) ?(\d\d\d)-\d{4}/");
   RETAIN RE;
   INPUT NUMBER $CHAR80.;
   MATCH = PRXMATCH(RE,NUMBER);
   IF MATCH GT 0 THEN DO;
      /* The area code is always three digits per the pattern, so only its
         start position is requested here. */
      CALL PRXPOSN(RE,1,AREA_START);
      CALL PRXPOSN(RE,2,EX_START,EX_LENGTH);
      AREA_CODE = SUBSTR(NUMBER,AREA_START,3);
      EXCHANGE = SUBSTR(NUMBER,EX_START,EX_LENGTH);
   END;
   DROP RE;
DATALINES;
THIS LINE DOES NOT HAVE ANY PHONE NUMBERS ON IT
THIS LINE DOES: (123)345-4567 LA DI LA DI LA
ALSO VALID (609) 999-9999
TWO NUMBERS HERE (333)444-5555 AND (800)123-4567
;
PROC PRINT DATA=PIECES NOOBS HEADING=H;
   TITLE "Listing of Data Set PIECES";
RUN;

/* Program 8: Use regular expressions to read very unstructured data.
   Primary functions: PRXPARSE, PRXMATCH, PRXPOSN
   Other functions: SUBSTR, INPUT
   The step reads every line of data and, for any line that contains two or
   more numbers, assigns the first number to X and the second number to Y. */
DATA READ_NUM;
   RETAIN RET;
   /* Capture the first and second numbers on the line:
         (\d+)   one or more digits, capture buffer 1
         b+      one or more blanks (b = blank)
         \D*     zero or more non-digits
         (\d+)   one or more digits, capture buffer 2 */
   IF _N_ = 1 THEN DO;
      RET = PRXPARSE("/(\d+) +\D*(\d+)/");
   END;
   INPUT STRING $CHAR40.;
   POS = PRXMATCH(RET,STRING);
   IF POS > 0 THEN DO;
      /* Convert each captured substring to a numeric value. */
      CALL PRXPOSN(RET,1,START1,LENGTH1);
      IF START1 > 0 THEN X = INPUT(SUBSTR(STRING,START1,LENGTH1),9.);
      CALL PRXPOSN(RET,2,START2,LENGTH2);
      IF START2 > 0 THEN Y = INPUT(SUBSTR(STRING,START2,LENGTH2),9.);
      OUTPUT;
   END;
   KEEP STRING X Y;
DATALINES;
XXXXXXXXXXXXXXXXXX 9 XXXXXXX 123
This line has a 6 and a 123 in it
456 789
None on this line
Only one here: 77
;
PROC PRINT DATA=READ_NUM NOOBS;
   TITLE "Listing of Data Set READ_NUM";
RUN;

/* Program 9: Find digits in random positions in an input string using
   CALL PRXNEXT.
   Primary functions: PRXPARSE, PRXNEXT */
DATA FIND_NUM;
   RETAIN RET;
   /* Look for one or more digits in a row. */
   IF _N_ = 1 THEN DO;
      RET = PRXPARSE("/\d+/");
   END;
   INPUT STRING $40.;
   /* Search window: from the first character to the last non-blank one. */
   START = 1;
   STOP = LENGTH(STRING);
   /* Prime the loop with the first match. */
   CALL PRXNEXT(RET,START,STOP,STRING,POSITION,LENGTH);
   ARRAY X[5];
   /* Store up to five numbers; the loop also ends as soon as PRXNEXT
      reports no further match (POSITION = 0). */
   DO I = 1 TO 5 WHILE (POSITION > 0);
      X[I] = INPUT(SUBSTR(STRING,POSITION,LENGTH),9.);
      CALL PRXNEXT(RET,START,STOP,STRING,POSITION,LENGTH);
   END;
   KEEP X1-X5 STRING;
DATALINES;
THIS 45 LINE 98 HAS 3 NUMBERS
NONE HERE
12 34 78 90
;
PROC PRINT DATA=FIND_NUM NOOBS;
   TITLE "Listing of Data Set FIND_NUM";
RUN;

/* Program 10: Demonstrate the PRXPAREN function.
   Primary functions: PRXPARSE, PRXMATCH, PRXPAREN */
DATA PAREN;
   RETAIN PATTERN;
   /* Three alternatives, each in its own capture buffer: a one-, two-, or
      three-digit number followed by a blank. */
   IF _N_ = 1 THEN DO;
      PATTERN = PRXPARSE("/(\d )|(\d\d )|(\d\d\d )/");
   END;
   INPUT STRING $CHAR30.;
   POSITION = PRXMATCH(PATTERN,STRING);
   /* After a successful match, ask which capture buffer matched. */
   IF POSITION > 0 THEN DO;
      WHICH_PAREN = PRXPAREN(PATTERN);
   END;
DATALINES;
one single digit 8 here
two 888 77
12345 1234 123 12 1
;
PROC PRINT DATA=PAREN NOOBS;
   TITLE "Listing of Data Set PAREN";
RUN;

/* Program 11: Demonstrate the PRXCHANGE call routine.
   Primary functions: PRXPARSE, PRXCHANGE */
DATA CAT_AND_MOUSE;
   INPUT TEXT $CHAR40.;
   LENGTH NEW_TEXT $ 80;
   RETAIN MATCH;
   /* Substitution pattern: replace "Cat" or "cat" with Mouse. */
   IF _N_ = 1 THEN DO;
      MATCH = PRXPARSE("s/[Cc]at/Mouse/");
   END;
   /* -1 requests replacement of every occurrence. The result goes to
      NEW_TEXT; the remaining arguments receive the result length, a
      truncation flag, and the number of changes made. */
   CALL PRXCHANGE(MATCH,-1,TEXT,NEW_TEXT,R_LENGTH,TRUNC,N_OF_CHANGES);
   IF TRUNC THEN DO;
      PUT "Note: NEW_TEXT was truncated";
   END;
DATALINES;
The Cat in the hat
There are two cat cats in this line
;
PROC PRINT DATA=CAT_AND_MOUSE NOOBS;
   TITLE "Listing of CAT_AND_MOUSE";
RUN;

/* Program 12: Demonstrate the use of capture buffers with PRXCHANGE.
   Primary functions: PRXPARSE, PRXCHANGE */
DATA CAPTURE;
   /* Substitution pattern that swaps two words:
         (\w+ +)   first word plus its trailing blank(s), buffer 1
         (\w+)     second word, buffer 2
      The replacement "$2 $1" writes the second word first.
      Fix: the substitution operator is written as lowercase "s", matching
      the documented Perl syntax and the other substitution example. */
   IF _N_ = 1 THEN RETURN = PRXPARSE("s/(\w+ +)(\w+)/$2 $1/");
   RETAIN RETURN;
   INPUT STRING $20.;
   /* With no separate result variable, STRING is modified in place;
      -1 requests replacement of every occurrence. */
   CALL PRXCHANGE(RETURN,-1,STRING);
DATALINES;
Ron Cody
Russell Lynn
;
PROC PRINT DATA=CAPTURE NOOBS;
   /* Fix: removed the stray ")" that followed the closing quote. */
   TITLE "Listing of Data Set CAPTURE";
RUN;

 

抱歉!评论已关闭.