现在的位置: 首页 > 综合 > 正文

PROC SQL & DATA Step do the same job

2018年02月19日 ⁄ 综合 ⁄ 共 7598字 ⁄ 字号 评论关闭

sas中,很多情况下 data setp能够搞定的,proc sql也可以搞定。论坛里面经常碰到一题多解,例如汇总、重复数据、数据合并等问题。

很多网友也曾作过一些汇总,且巧浏览网页时发现NESUG 2011的一篇文章总结的相当齐全,特此摘录文中例子。感兴趣的网友可以自己google,原文题目《PROC SQL for DATA Step Die-hards》,作者:Christianna S. Williams。

EXAMPLE 1: SUBSETTING VARIABLES (COLUMNS)

*DATA step code;
DATA selvar1 ;
   SET ex.admissions (KEEP = pt_id admdate disdate);
RUN;
*SQL code;
PROC SQL;
   CREATE TABLE selvar2 AS
   SELECT pt_id, admdate, disdate
   FROM ex.admissions ;
QUIT;

EXAMPLE 2: SELECTING OBSERVATIONS (ROWS)

*DATA step code;
DATA vahosp1 ;
   SET ex.admissions (WHERE = (hosp EQ 3));
RUN;
*SQL code;
PROC SQL FEEDBACK;
   CREATE TABLE vahosp2 AS
   SELECT *
   FROM ex.admissions
   WHERE hosp EQ 3;
QUIT;

EXAMPLE 3: CREATING A NEW VARIABLE

*DATA step code;
DATA grouping ;
   SET ex.admissions ;
   LENGTH dxgrp $5 ;
   IF primdx EQ: '410' THEN dxgrp = 'MI' ;
   ELSE IF primdx EQ: '428' THEN dxgrp = 'CHF';
   ELSE dxgrp = 'other' ;
RUN;
*SQL code;
PROC SQL;
   CREATE TABLE grouping2 AS
   SELECT *,
   CASE
      WHEN primdx LIKE '410%' THEN 'MI'
      WHEN primdx LIKE '428%' THEN 'CHF'
      ELSE 'other'
   END AS dxgrp
   FROM ex.admissions;
QUIT;

EXAMPLE 4: SELECTING ROWS BASED ON A CREATED VARIABLE

*DATA Step code;
DATA twowks1 ;
   SET ex.admissions (KEEP = pt_id hosp admdate disdate) ;
   ATTRIB los LENGTH=4 LABEL='Length of Stay';
          los = (disdate - admdate) + 1;
   IF los GE 14 ;
RUN;
*SQL code;
PROC SQL;
   CREATE TABLE twowks2 AS
   SELECT pt_id, hosp, admdate, disdate,
         (disdate-admdate) + 1 AS los LENGTH=4 LABEL=’Length of Stay’
   FROM ex.admissions
   WHERE CALCULATED los GE 14;
QUIT;

EXAMPLE 5: SELECTING ROWS IN ONE TABLE BASED ON INFORMATION FROM ANOTHER TABLE

*DATA Step Code;
PROC SORT DATA = ex.admissions OUT=admits;
   BY hosp ;
RUN;
DATA vahosp1d (DROP = hospname) ;
   MERGE admits (IN=adm)
         ex.hospitals (IN=va KEEP = hosp_id hospname
                       RENAME = (hosp_id=hosp)
                       WHERE = (hospname EQ: 'Veteran'));
   BY hosp ;
   IF adm AND va;
RUN;
PROC SORT;
   BY pt_id admdate;
RUN;

  *PROC SQL Code;
PROC SQL ;
   CREATE TABLE vahosp2d AS
   SELECT *
   FROM ex.admissions
      WHERE hosp IN
        (SELECT hosp_id
         FROM ex.hospitals
         WHERE hospname LIKE "Veteran%")
   ORDER BY pt_id, admdate ;
QUIT;

EXAMPLE 6: USING SUMMARY FUNCTIONS

*DATA Step Code;
DATA admsum1 ;
   SET ex.admissions ;
   BY pt_id;
** (1) Initialization;
   IF FIRST.pt_id THEN DO;
      nstays = 0;
      minlos = .;
      maxlos = .;
   END;
** (2) Accumulation;
   nstays = nstays + 1;
   los = (disdate - admdate) + 1;
   minlos = MIN(OF minlos los) ;
   maxlos = MAX(OF maxlos los) ;
** (3) Output;
   IF LAST.pt_id THEN OUTPUT ;
   RETAIN nstays minlos maxlos ;
   KEEP pt_id nstays minlos maxlos ;
RUN;

PROC SQL code:
PROC SQL;
   CREATE TABLE admsum2 AS
   SELECT pt_id, COUNT(*) AS nstays,
          MIN(disdate - admdate + 1) AS minlos,
          MAX(disdate - admdate + 1) AS maxlos
   FROM ex.admissions
   GROUP BY pt_id ;
QUIT;

EXAMPLE 7: SELECTION BASED ON SUMMARY FUNCTIONS

*DATA Step Code;
PROC SUMMARY DATA= ex.admissions ;
   VAR bp_sys ;
   OUTPUT OUT=bpstats MEAN(bp_sys)= mean_sys STD(bp_sys) = sd_sys ;
RUN;
DATA hi_sys1 ;
   SET bpstats (keep=mean_sys sd_sys)
       ex.admissions ;
   IF _N_ EQ 1 THEN DO;
      high = mean_sys + 2*(sd_sys) ;
      low = mean_sys - 2*(sd_sys) ;
      DELETE;
   END;
   RETAIN high low;
   IF (bp_sys GE high) OR (bp_sys LE low) ;
   DROP mean_sys sd_sys high low ;
RUN;

*PROC SQL Code;
PROC SQL ;
   CREATE TABLE hi_sys2 AS
   SELECT * FROM ex.admissions
   WHERE (bp_sys GE
            (SELECT MEAN(bp_sys)+ 2*STD(bp_sys)
             FROM ex.admissions))
          OR (bp_sys LE
            (SELECT MEAN(bp_sys) - 2*STD(bp_sys)
             FROM ex.admissions));
QUIT;

EXAMPLE 8: SELECTION BASED ON SUMMARY FUNCTION WITH “RE-MERGE”

*DATA Step Code;
PROC SUMMARY DATA= ex.admissions NWAY;
   CLASS dest ;
   VAR bp_sys ;
   OUTPUT OUT=bpstats2 MEAN(bp_sys)=mean_sys STD(bp_sys)=sd_sys ;
RUN;
PROC SORT DATA = ex.admissions OUT=admissions;
   BY dest ;
RUN;
DATA hi_sys3 ;
   MERGE admissions (KEEP = pt_id bp_sys bp_dia dest)
         bpstats2 (KEEP = dest mean_sys sd_sys);
   BY dest ;
   IF bp_sys GE mean_sys + 2*(sd_sys) OR
      bp_sys LE mean_sys - 2*(sd_sys) ;
   FORMAT mean_sys sd_sys 6.2;
RUN;

*PROC SQL Code;
PROC SQL;
   CREATE TABLE hi_sys4 AS
   SELECT pt_id, bp_sys, bp_dia, dest,
          MEAN(bp_sys) AS mean_sys FORMAT=6.2,
          STD(bp_sys) AS sd_sys FORMAT=6.2
   FROM ex.admissions
   GROUP BY dest
   HAVING bp_sys GE (mean_sys + 2*sd_sys)
          OR bp_sys LE (mean_sys – 2*sd_sys) ;
QUIT;

EXAMPLE 9: IDENTIFYING DUPLICATES

*DATA Step Code;
DATA selmd1 ;
   SET ex.doctors (KEEP = md_id lastname hospadm
                   RENAME = (hospadm=hospital));
   BY md_id ;
   IF NOT (FIRST.md_id AND LAST.md_id) ;
RUN;
PROC SORT DATA=selmd1; 
   BY lastname hospital ;
RUN;

*PROC SQL Code;
PROC SQL ;
   CREATE TABLE selmd2 AS
   SELECT md_id, lastname, hospadm AS hospital
   FROM ex.doctors
   GROUP BY md_id
   HAVING COUNT(*) GE 2
   ORDER BY lastname, hospital ;
QUIT;

EXAMPLE 10: CREATION OF TWO DATA SETS FROM ONE

*DATA Step Code;
DATA admit06 admit07 ;
   SET ex.admissions ;
   IF YEAR(admdate) = 2006 THEN OUTPUT admit06;
   ELSE IF YEAR(admdate) = 2007 THEN OUTPUT admit07;
RUN;

*PROC SQL Code;
PROC SQL ;
   CREATE TABLE admit06 AS
     SELECT * 
   FROM ex.admissions
     WHERE YEAR(admdate) = 2006;
   CREATE TABLE admit07 AS
     SELECT * FROM ex.admissions
   WHERE YEAR(admdate) = 2007;
QUIT;

EXAMPLE 11: CONCATENATION

*DATA Step Code;
DATA alladm1 ;
   SET admit06 admit07 ;
   BY pt_id ;
RUN;

*PROC SQL Code;
PROC SQL ;
   CREATE TABLE alladm2 AS
     SELECT * 
     FROM admit06
   UNION ALL CORRESPONDING
     SELECT * 
     FROM admit07
   ORDER BY pt_id;
QUIT;

EXAMPLE 12: SELECTING RECORDS UNIQUE TO ONE TABLE

*DATA Step Code;
DATA only2006 ;
   MERGE admit06 (IN=in06 keep = pt_id)
         admit07 (IN=in07 keep = pt_id);
   BY pt_id ;
   IF in06 AND NOT in07 ;
   IF FIRST.pt_id ;
RUN;

*PROC SQL Code;
PROC SQL ;
   CREATE TABLE only2006 AS
   SELECT pt_id 
     FROM admit06
   EXCEPT
   SELECT pt_id
     FROM admit07;
QUIT;

*ALTERNATIVE PROC SQL Code;
PROC SQL ;
   CREATE TABLE only2006_A AS
   SELECT *
   FROM admit06
   WHERE pt_id NOT IN
        (SELECT pt_ID FROM admit07) ;
QUIT;

EXAMPLE 13: INNER JOIN OF TWO TABLES

*DATA Step Code;
DATA admits1 ;
   MERGE ex.admissions (IN=adm KEEP = pt_id admdate disdate hosp md)
         ex.patients (IN=pts KEEP = id lastname sex primmd
                      RENAME = (id=pt_id));
   BY pt_id ;
   IF adm AND pts;
RUN;

*PROC SQL code;
PROC SQL ;
   CREATE TABLE admits2 AS
   SELECT pt_id, admdate, disdate, hosp, md, lastname, sex, primmd
   FROM ex.admissions AS a,
        ex.patients AS b
   WHERE a.pt_id = b.id
   ORDER BY a.pt_id, admdate ;
QUIT;

*Alternative PROC SQL code;
PROC SQL ;
   CREATE TABLE admits2 AS
   SELECT pt_id, admdate, disdate, hosp, md, lastname, sex, primmd
   FROM ex.admissions INNER JOIN
        ex.patients
   ON pt_id = id
   ORDER BY pt_id, admdate ;
QUIT;

EXAMPLE 14: JOIN OF THREE TABLES WITH ROW SELECTION

*DATA Step Code;
DATA died1 (RENAME = (disdate=dthdate)) ;
   MERGE ex.admissions (IN=dth KEEP = pt_id disdate hosp dest
                        WHERE = (dest=9))
         ex.patients (IN=pts KEEP = id birthdate RENAME = (id=pt_id));
   BY pt_id ;
   IF dth AND pts ;
   agedth = FLOOR((disdate-birthdate)/365.25) ;
   DROP dest birthdate ;
RUN;
PROC SORT DATA=died1;
   BY hosp;
RUN;
DATA died1b ;
   MERGE died1 (IN=dth RENAME=(hosp=hosp_id))
         ex.hospitals (IN=hsp KEEP=hosp_id nbeds);
   BY hosp_id ;
   IF dth AND hsp ;
   DROP hosp_id;
RUN;
PROC SORT;
   BY pt_id ;
RUN;

*PROC SQL code;
PROC SQL ;
   CREATE TABLE died2 AS
   SELECT pt_id, nbeds, disdate AS dthdate,
          INT((disdate-birthdate)/365.25) AS agedth
   FROM ex.admissions, 
        ex.hospitals, 
        ex.patients
   WHERE (pt_id = id) AND (hosp = hosp_id) AND dest EQ 9
   ORDER BY pt_id ;
QUIT;

EXAMPLE 15: LEFT OUTER JOIN

*DATA Step Code;
PROC SORT DATA = ex.admissions (KEEP = hosp)
   OUT=admits (RENAME=(hosp=hosp_id)) NODUPKEY;
   BY hosp ;
RUN;
DATA hosps1 ;
   MERGE ex.hospitals (IN=hosp)
         admits (IN=adm);
   BY hosp_id ;
   IF hosp ;
   hasadmit = adm ;
RUN;

*PROC SQL code;
PROC SQL ;
   CREATE TABLE hosps2 AS
      SELECT DISTINCT a.*, hosp IS NOT NULL AS hasadmit
   FROM ex.hospitals a 
   LEFT JOIN
        ex.admissions b
   ON a.hosp_id = b.hosp ;
QUIT;

EXAMPLE 16: INNER JOIN WITH A SUBQUERY

*DATA Step Code;
DATA primdoc (DROP = primmd);
   MERGE ex.admissions (IN=adm KEEP = pt_id admdate disdate hosp md)
         ex.patients (IN=pts KEEP = id lastname primmd RENAME=(id=pt_id));
   BY pt_id ;
   IF adm AND pts AND (md EQ primmd) ;
RUN;
PROC SORT DATA=primdoc; BY md; RUN;
DATA doctors ;
   SET ex.doctors (KEEP = md_id lastname);
   BY md_id ;
   IF FIRST.md_id ;
RUN;
DATA primdoc1a ;
   MERGE primdoc (IN=p RENAME=(lastname=ptname md=md_id))
         doctors (RENAME = (lastname=mdname));
   BY md_id ;
   IF p ;
RUN;
PROC SORT DATA=primdoc1a ;
   BY pt_id admdate;
RUN;

*PROC SQL Code;
PROC SQL ;
   CREATE TABLE primdoc2 AS
   SELECT pt_id, admdate, disdate, hosp, md_id,
          b.lastname AS ptname,
          c.lastname AS mdname
   FROM ex.admissions a, 
        ex.patients b,
       (SELECT DISTINCT md_id, lastname
        FROM ex.doctors) c
   WHERE (a.pt_id EQ b.id) AND
         (a.md EQ b.primmd) AND
         (a.md EQ c.md_id)
   ORDER BY a.pt_id, admdate ;
QUIT;

EXAMPLE 17: A CORRELATED SUBQUERY

*DATA Step Code;
PROC SORT DATA = ex.admissions (KEEP=md hosp) OUT = admits;
   BY md;
RUN;
PROC SORT DATA = ex.doctors OUT=doctors NODUPKEY ;
   BY md_id ;
RUN;
DATA vadocs1 (DROP = hosp);
   MERGE doctors (IN=docs KEEP=md_id lastname)
         admits (IN=adm WHERE=(hosp = 3) RENAME = (md=md_id)) ;
   BY md_id;
   IF docs AND adm AND FIRST.md_id ;
RUN;
PROC SORT;
   BY lastname; 
RUN;

*PROC SQL Code;
PROC SQL;
   CREATE TABLE vadocs2 AS
   SELECT DISTINCT md_id, lastname 
   FROM ex.doctors AS d
   WHERE 3 IN (SELECT hosp 
               FROM ex.admissions AS a
               WHERE d.md_id = a.md)
   ORDER BY lastname;
QUIT;

抱歉!评论已关闭.