sas中,很多情况下 data setp能够搞定的,proc sql也可以搞定。论坛里面经常碰到一题多解,例如汇总、重复数据、数据合并等问题。
很多网友也曾作过一些汇总,且巧浏览网页时发现NESUG 2011的一篇文章总结的相当齐全,特此摘录文中例子。感兴趣的网友可以自己google,原文题目《PROC SQL for DATA Step Die-hards》,作者:Christianna S. Williams。
EXAMPLE 1: SUBSETTING VARIABLES (COLUMNS)
*DATA step code; DATA selvar1 ; SET ex.admissions (KEEP = pt_id admdate disdate); RUN; *SQL code; PROC SQL; CREATE TABLE selvar2 AS SELECT pt_id, admdate, disdate FROM ex.admissions ; QUIT;
EXAMPLE 2: SELECTING OBSERVATIONS (ROWS)
*DATA step code; DATA vahosp1 ; SET ex.admissions (WHERE = (hosp EQ 3)); RUN; *SQL code; PROC SQL FEEDBACK; CREATE TABLE vahosp2 AS SELECT * FROM ex.admissions WHERE hosp EQ 3; QUIT;
EXAMPLE 3: CREATING A NEW VARIABLE
*DATA step code; DATA grouping ; SET ex.admissions ; LENGTH dxgrp $5 ; IF primdx EQ: '410' THEN dxgrp = 'MI' ; ELSE IF primdx EQ: '428' THEN dxgrp = 'CHF'; ELSE dxgrp = 'other' ; RUN; *SQL code; PROC SQL; CREATE TABLE grouping2 AS SELECT *, CASE WHEN primdx LIKE '410%' THEN 'MI' WHEN primdx LIKE '428%' THEN 'CHF' ELSE 'other' END AS dxgrp FROM ex.admissions; QUIT;
EXAMPLE 4: SELECTING ROWS BASED ON A CREATED VARIABLE
*DATA Step code; DATA twowks1 ; SET ex.admissions (KEEP = pt_id hosp admdate disdate) ; ATTRIB los LENGTH=4 LABEL='Length of Stay'; los = (disdate - admdate) + 1; IF los GE 14 ; RUN; *SQL code; PROC SQL; CREATE TABLE twowks2 AS SELECT pt_id, hosp, admdate, disdate, (disdate-admdate) + 1 AS los LENGTH=4 LABEL=’Length of Stay’ FROM ex.admissions WHERE CALCULATED los GE 14; QUIT;
EXAMPLE 5: SELECTING ROWS IN ONE TABLE BASED ON INFORMATION FROM ANOTHER TABLE
*DATA Step Code; PROC SORT DATA = ex.admissions OUT=admits; BY hosp ; RUN; DATA vahosp1d (DROP = hospname) ; MERGE admits (IN=adm) ex.hospitals (IN=va KEEP = hosp_id hospname RENAME = (hosp_id=hosp) WHERE = (hospname EQ: 'Veteran')); BY hosp ; IF adm AND va; RUN; PROC SORT; BY pt_id admdate; RUN; *PROC SQL Code; PROC SQL ; CREATE TABLE vahosp2d AS SELECT * FROM ex.admissions WHERE hosp IN (SELECT hosp_id FROM ex.hospitals WHERE hospname LIKE "Veteran%") ORDER BY pt_id, admdate ; QUIT;
EXAMPLE 6: USING SUMMARY FUNCTIONS
*DATA Step Code; DATA admsum1 ; SET ex.admissions ; BY pt_id; ** (1) Initialization; IF FIRST.pt_id THEN DO; nstays = 0; minlos = .; maxlos = .; END; ** (2) Accumulation; nstays = nstays + 1; los = (disdate - admdate) + 1; minlos = MIN(OF minlos los) ; maxlos = MAX(OF maxlos los) ; ** (3) Output; IF LAST.pt_id THEN OUTPUT ; RETAIN nstays minlos maxlos ; KEEP pt_id nstays minlos maxlos ; RUN; PROC SQL code: PROC SQL; CREATE TABLE admsum2 AS SELECT pt_id, COUNT(*) AS nstays, MIN(disdate - admdate + 1) AS minlos, MAX(disdate - admdate + 1) AS maxlos FROM ex.admissions GROUP BY pt_id ; QUIT;
EXAMPLE 7: SELECTION BASED ON SUMMARY FUNCTIONS
*DATA Step Code; PROC SUMMARY DATA= ex.admissions ; VAR bp_sys ; OUTPUT OUT=bpstats MEAN(bp_sys)= mean_sys STD(bp_sys) = sd_sys ; RUN; DATA hi_sys1 ; SET bpstats (keep=mean_sys sd_sys) ex.admissions ; IF _N_ EQ 1 THEN DO; high = mean_sys + 2*(sd_sys) ; low = mean_sys - 2*(sd_sys) ; DELETE; END; RETAIN high low; IF (bp_sys GE high) OR (bp_sys LE low) ; DROP mean_sys sd_sys high low ; RUN; *PROC SQL Code; PROC SQL ; CREATE TABLE hi_sys2 AS SELECT * FROM ex.admissions WHERE (bp_sys GE (SELECT MEAN(bp_sys)+ 2*STD(bp_sys) FROM ex.admissions)) OR (bp_sys LE (SELECT MEAN(bp_sys) - 2*STD(bp_sys) FROM ex.admissions)); QUIT;
EXAMPLE 8: SELECTION BASED ON SUMMARY FUNCTION WITH “RE-MERGE”
*DATA Step Code; PROC SUMMARY DATA= ex.admissions NWAY; CLASS dest ; VAR bp_sys ; OUTPUT OUT=bpstats2 MEAN(bp_sys)=mean_sys STD(bp_sys)=sd_sys ; RUN; PROC SORT DATA = ex.admissions OUT=admissions; BY dest ; RUN; DATA hi_sys3 ; MERGE admissions (KEEP = pt_id bp_sys bp_dia dest) bpstats2 (KEEP = dest mean_sys sd_sys); BY dest ; IF bp_sys GE mean_sys + 2*(sd_sys) OR bp_sys LE mean_sys - 2*(sd_sys) ; FORMAT mean_sys sd_sys 6.2; RUN; *PROC SQL Code; PROC SQL; CREATE TABLE hi_sys4 AS SELECT pt_id, bp_sys, bp_dia, dest, MEAN(bp_sys) AS mean_sys FORMAT=6.2, STD(bp_sys) AS sd_sys FORMAT=6.2 FROM ex.admissions GROUP BY dest HAVING bp_sys GE (mean_sys + 2*sd_sys) OR bp_sys LE (mean_sys – 2*sd_sys) ; QUIT;
EXAMPLE 9: IDENTIFYING DUPLICATES
*DATA Step Code; DATA selmd1 ; SET ex.doctors (KEEP = md_id lastname hospadm RENAME = (hospadm=hospital)); BY md_id ; IF NOT (FIRST.md_id AND LAST.md_id) ; RUN; PROC SORT DATA=selmd1; BY lastname hospital ; RUN; *PROC SQL Code; PROC SQL ; CREATE TABLE selmd2 AS SELECT md_id, lastname, hospadm AS hospital FROM ex.doctors GROUP BY md_id HAVING COUNT(*) GE 2 ORDER BY lastname, hospital ; QUIT;
EXAMPLE 10: CREATION OF TWO DATA SETS FROM ONE
*DATA Step Code; DATA admit06 admit07 ; SET ex.admissions ; IF YEAR(admdate) = 2006 THEN OUTPUT admit06; ELSE IF YEAR(admdate) = 2007 THEN OUTPUT admit07; RUN; *PROC SQL Code; PROC SQL ; CREATE TABLE admit06 AS SELECT * FROM ex.admissions WHERE YEAR(admdate) = 2006; CREATE TABLE admit07 AS SELECT * FROM ex.admissions WHERE YEAR(admdate) = 2007; QUIT;
EXAMPLE 11: CONCATENATION
*DATA Step Code; DATA alladm1 ; SET admit06 admit07 ; BY pt_id ; RUN; *PROC SQL Code; PROC SQL ; CREATE TABLE alladm2 AS SELECT * FROM admit06 UNION ALL CORRESPONDING SELECT * FROM admit07 ORDER BY pt_id; QUIT;
EXAMPLE 12: SELECTING RECORDS UNIQUE TO ONE TABLE
*DATA Step Code; DATA only2006 ; MERGE admit06 (IN=in06 keep = pt_id) admit07 (IN=in07 keep = pt_id); BY pt_id ; IF in06 AND NOT in07 ; IF FIRST.pt_id ; RUN; *PROC SQL Code; PROC SQL ; CREATE TABLE only2006 AS SELECT pt_id FROM admit06 EXCEPT SELECT pt_id FROM admit07; QUIT; *ALTERNATIVE PROC SQL Code; PROC SQL ; CREATE TABLE only2006_A AS SELECT * FROM admit06 WHERE pt_id NOT IN (SELECT pt_ID FROM admit07) ; QUIT;
EXAMPLE 13: INNER JOIN OF TWO TABLES
*DATA Step Code; DATA admits1 ; MERGE ex.admissions (IN=adm KEEP = pt_id admdate disdate hosp md) ex.patients (IN=pts KEEP = id lastname sex primmd RENAME = (id=pt_id)); BY pt_id ; IF adm AND pts; RUN; *PROC SQL code; PROC SQL ; CREATE TABLE admits2 AS SELECT pt_id, admdate, disdate, hosp, md, lastname, sex, primmd FROM ex.admissions AS a, ex.patients AS b WHERE a.pt_id = b.id ORDER BY a.pt_id, admdate ; QUIT; *Alternative PROC SQL code; PROC SQL ; CREATE TABLE admits2 AS SELECT pt_id, admdate, disdate, hosp, md, lastname, sex, primmd FROM ex.admissions INNER JOIN ex.patients ON pt_id = id ORDER BY pt_id, admdate ; QUIT;
EXAMPLE 14: JOIN OF THREE TABLES WITH ROW SELECTION
*DATA Step Code; DATA died1 (RENAME = (disdate=dthdate)) ; MERGE ex.admissions (IN=dth KEEP = pt_id disdate hosp dest WHERE = (dest=9)) ex.patients (IN=pts KEEP = id birthdate RENAME = (id=pt_id)); BY pt_id ; IF dth AND pts ; agedth = FLOOR((disdate-birthdate)/365.25) ; DROP dest birthdate ; RUN; PROC SORT DATA=died1; BY hosp; RUN; DATA died1b ; MERGE died1 (IN=dth RENAME=(hosp=hosp_id)) ex.hospitals (IN=hsp KEEP=hosp_id nbeds); BY hosp_id ; IF dth AND hsp ; DROP hosp_id; RUN; PROC SORT; BY pt_id ; RUN; *PROC SQL code; PROC SQL ; CREATE TABLE died2 AS SELECT pt_id, nbeds, disdate AS dthdate, INT((disdate-birthdate)/365.25) AS agedth FROM ex.admissions, ex.hospitals, ex.patients WHERE (pt_id = id) AND (hosp = hosp_id) AND dest EQ 9 ORDER BY pt_id ; QUIT;
EXAMPLE 15: LEFT OUTER JOIN
*DATA Step Code; PROC SORT DATA = ex.admissions (KEEP = hosp) OUT=admits (RENAME=(hosp=hosp_id)) NODUPKEY; BY hosp ; RUN; DATA hosps1 ; MERGE ex.hospitals (IN=hosp) admits (IN=adm); BY hosp_id ; IF hosp ; hasadmit = adm ; RUN; *PROC SQL code; PROC SQL ; CREATE TABLE hosps2 AS SELECT DISTINCT a.*, hosp IS NOT NULL AS hasadmit FROM ex.hospitals a LEFT JOIN ex.admissions b ON a.hosp_id = b.hosp ; QUIT;
EXAMPLE 16: INNER JOIN WITH A SUBQUERY
*DATA Step Code; DATA primdoc (DROP = primmd); MERGE ex.admissions (IN=adm KEEP = pt_id admdate disdate hosp md) ex.patients (IN=pts KEEP = id lastname primmd RENAME=(id=pt_id)); BY pt_id ; IF adm AND pts AND (md EQ primmd) ; RUN; PROC SORT DATA=primdoc; BY md; RUN; DATA doctors ; SET ex.doctors (KEEP = md_id lastname); BY md_id ; IF FIRST.md_id ; RUN; DATA primdoc1a ; MERGE primdoc (IN=p RENAME=(lastname=ptname md=md_id)) doctors (RENAME = (lastname=mdname)); BY md_id ; IF p ; RUN; PROC SORT DATA=primdoc1a ; BY pt_id admdate; RUN; *PROC SQL Code; PROC SQL ; CREATE TABLE primdoc2 AS SELECT pt_id, admdate, disdate, hosp, md_id, b.lastname AS ptname, c.lastname AS mdname FROM ex.admissions a, ex.patients b, (SELECT DISTINCT md_id, lastname FROM ex.doctors) c WHERE (a.pt_id EQ b.id) AND (a.md EQ b.primmd) AND (a.md EQ c.md_id) ORDER BY a.pt_id, admdate ; QUIT;
EXAMPLE 17: A CORRELATED SUBQUERY
*DATA Step Code; PROC SORT DATA = ex.admissions (KEEP=md hosp) OUT = admits; BY md; RUN; PROC SORT DATA = ex.doctors OUT=doctors NODUPKEY ; BY md_id ; RUN; DATA vadocs1 (DROP = hosp); MERGE doctors (IN=docs KEEP=md_id lastname) admits (IN=adm WHERE=(hosp = 3) RENAME = (md=md_id)) ; BY md_id; IF docs AND adm AND FIRST.md_id ; RUN; PROC SORT; BY lastname; RUN; *PROC SQL Code; PROC SQL; CREATE TABLE vadocs2 AS SELECT DISTINCT md_id, lastname FROM ex.doctors AS d WHERE 3 IN (SELECT hosp FROM ex.admissions AS a WHERE d.md_id = a.md) ORDER BY lastname; QUIT;