RemoveDuplicates.sas

/****************************************************************************/
/*  Program Name  : RemoveDuplicates                                        */
/*  Purpose       : Macro utility to remove duplicate observations from a   */
/*                  dataset.                                                */
/*                                                                          */
/*--------------------------------------------------------------------------*/
/*  Author        : Matt Trzcinski                                          */
/*  Last Update   : 2017/06/28                                              */
/*                                                                          */
/*--------------------------------------------------------------------------*/
/*  Input(s)      : SAS dataset from the WORK library                       */
/*  Output(s)     : Dataset with duplicates removed.  Default output name   */
/*                  is input data set with '_less_dups' suffix.             */
/*  Usage         : Macro has 6 parameters. *indicates required parameter.  */
/*                                                                          */
/*              1.*from=          Dataset to remove observations from.      */
/*                                If using default out= (see below), then   */
/*                                the length of this dataset name cannot    */
/*                                exceed 22 characters.                     */
/*              2.by=             What variables defines a duplicate.  List */
/*                                variables separated with a space.  May be */
/*                                given as a vertical list.  Default is     */
/*                                _all_.                                    */
/*              3.keep=           Must be 'first' or 'last'. Choose to      */
/*                                keep first or last occurrence of          */
/*                                duplicate value.  Default is 'first'.     */
/*              4.out=            Name of data set with duplicates removed. */
/*                                Cannot be longer than 32 characters.      */
/*                                Default is the input data set with        */
/*                                '_less_dups' suffix.                      */
/*              5.showdups=       Output data set containing duplicate      */
/*                                values.  Includes column for row in which */
/*                                the duplicate appears in the original     */
/*                                data set.  Must be 'true' or 'false'.     */
/*                                Default is 'false'.                       */
/*              6.ignoremissing=  Do not remove rows with missing values.   */
/*                                That is, missing values do not define a   */
/*                                BY group. See note below. Default option  */
/*                                is 'false'.                               */
/*              6.debug=          Toggle debug mode.  Display all notes and */
/*                                temporary data sets.  Must be 'true' or   */
/*                                'false'.  Default is 'false'.             */
/*                                                                          */
/*--------------------------------------------------------------------------*/
/*  Example       : %RemoveDuplicates(from=test, by=key1 key2);             */
/*                                                                          */
/*--------------------------------------------------------------------------*/
/*  Template      : %RemoveDuplicates(                                      */
/*                               from=                                      */
/*                    ,            by=                                      */
/*                    ,          keep=                                      */
/*                    ,           out=                                      */
/*                    ,      showdups=                                      */
/*                    , ignoremissing=                                      */
/*                    ,         debug=                                      */
/*                  );                                                      */
/*                                                                          */
/*--------------------------------------------------------------------------*/
/*  Note          : Removing duplicates can be thought of as selecting one  */
/*                  observation from each of a group of 'unique'            */
/*                  observations. Groups are defined with a BY statement.   */
/*                  That is, uniqueness/similarity is determined via the    */
/*                  BY statement. Sorting arranges the observations within  */
/*                  each subsequent BY variable.  Because of this, once the */
/*                  data has been sorted, the selection of a single         */
/*                  observation from each grouping can be made with first.  */
/*                  or last. of the last BY variable                        */
/*                                                                          */
/*                  Output is arranged in the same order as the original    */
/*                  data set but with duplicates removed.                   */
/*                                                                          */
/* Developer      : Cannot handle two-part names. For instance,             */
/*                  %RemoveDuplicates(from=sashelp.cars); will generate an  */
/*                  error.                                                  */
/*                                                                          */
/*                  Error handling needs code cleaning.  Needs better line  */
/*                  documentation.                                          */
/*                                                                          */
/****************************************************************************/

/****************************************************************************/
/*  Program Name  : GetObsCount                                             */
/*  Purpose       : Return the number of observations in a given data set.  */
/*                                                                          */
/*--------------------------------------------------------------------------*/
/*  Author        : Matt Trzcinski                                          */
/*  Last Update   : 2016/12/16                                              */
/*                                                                          */
/*--------------------------------------------------------------------------*/
/*  Input(s)      : SAS dataset                                             */
/*  Output(s)     : Number of observations as string                        */
/*  Usage         : Cannot be called in open code.  Must be part of an      */
/*                  expression or statement.                                */
/*                                                                          */
/*--------------------------------------------------------------------------*/
/*  Example       : %put There are %GetObsCount(sashelp.cars) observations; */
/*                                                                          */
/*--------------------------------------------------------------------------*/
/*  Note          : Modified from http://support.sas.com/kb/24/671.html     */
/*                                                                          */
/****************************************************************************/
%macro GetObsCount(dataset);
  /* Open assigns ID to open data set.  Assigns 0 if DNE */
  %let exists = %sysfunc(open(&dataset));
  %if &exists. %then %do;
    %let obsCount = %sysfunc(attrn(&exists, nobs));
    %let closed   = %sysfunc(close(&exists));
    %end;
  /* Output error if no dataset */
  %else %put %sysfunc(sysmsg());
  &obsCount
%mend;

%macro SetSystemOptions(opt1, opt2, opt3);
  options &opt1. &opt2. &opt3.;
%mend;

%macro RemoveDuplicates(
                                from=
                      ,            by=_all_
                      ,          keep=first
                      ,           out=
                      ,      showdups=false
                      , ignoremissing=false
                      ,         debug=false
                      );

  %let startTime = %sysfunc(datetime());

  %let originalNOTES = %sysfunc(getoption(notes));

  %if &debug. = false %then %do;
    %SetSystemOptions(nonotes);
    %end;

********************************************************************

** Verify inputs

********************************************************************;
  %if (&from.= )%then %do;
    %put ERROR: [MACRO] The from= option is blank.;
    %SetSystemOptions(&originalNOTES.);
    %abort cancel;
    %end;

  %if (%lowcase(&keep.) ^= first and %lowcase(&keep.) ^= last) %then %do;
    %put ERROR: [MACRO] Keep option must be either 'first' or 'last'.  Default is keep=first.;
    %SetSystemOptions(&originalNOTES.);
    %abort cancel;
    %end;

  %if (%lowcase(&showdups.) ^= true and %lowcase(&showdups.) ^= false) %then %do;
    %put ERROR: [MACRO] Showdups option must be either 'true' or 'false'.  Default is showdups=false.;
    %SetSystemOptions(&originalNOTES.);
    %abort cancel;
    %end;

  %if (%lowcase(&ignoremissing.) ^= true and %lowcase(&ignoremissing.) ^= false) %then %do;
    %put ERROR: [MACRO] Ignoremissing option must be either 'true' or 'false'.  Default is ignoremissing=false.;
    %SetSystemOptions(&originalNOTES.);
    %abort cancel;
    %end;

  %if (%lowcase(&debug.) ^= true and %lowcase(&debug.) ^= false) %then %do;
    %put ERROR: [MACRO] Debug option must be either 'true' or 'false'.  Default is debug=false.;
    %SetSystemOptions(&originalNOTES.);
    %abort cancel;
    %end;

  %if ((&out.= ) and %length(&from.) > 22) %then %do;
    data _null_;
      put "WARNING: [MACRO] from= dataset name exceeds 22 character limit."
          "                 Name truncated to accomodate default.";
    run;
    %let out = %substr(&from, 1, 22)_less_dups;
    option notes;
    %put NOTE: [MACRO] Created output dataset %upcase(&out.).;
    option nonotes;
  %end;

  %else %if %length(&out.) > 32 %then %do;
    data _null_;
      put "WARNING: [MACRO] Output dataset name exceeds 32 character limit."
          "                 Name truncated to fit length requirement.";
    run;
    %let out = %substr(&out, 1, 32);
    option notes;
    %put NOTE: [MACRO] Created output dataset %upcase(&out.).;
    option nonotes;
  %end;

  %else %if (&out.= ) %then %do;
    %let out = &from._less_dups;
    %SetSystemOptions(notes);
    %put NOTE: [MACRO] Created output dataset %upcase(&out.).;
    %SetSystemOptions(nonotes);
    %end;

  %else %do;
    %SetSystemOptions(notes);
    %put NOTE: [MACRO] Created output dataset: %upcase(&out.).;
    %SetSystemOptions(nonotes);
    %end;

********************************************************************

** Determine last BY group variable

********************************************************************;

**********************************

*** Get list of variables

**********************************;
  %if &by. = _all_ %then %do;
    proc sql noprint;
      select
        name into :variables separated by ' '
      from
        dictionary.columns
      where
        memname=upcase("&from.")
      ;
    quit;

**********************************

*** Select last variable in data set

**********************************;
    %let lastByVar = %scan(&variables., -1, ' ');
  %end;
  %else
    /* select last &by.= variable */
    %let lastByVar = %scan(&by., -1, ' ');

********************************************************************

** Remove duplicates

********************************************************************;

**********************************

*** Add index

**********************************;
  data _indexed;
    set &from.;
    _index_ = _N_;
  run;

**********************************

*** Create BY groups

**********************************;
  proc sort data = _indexed out = _sorted;
    by &by.;
  run;

**********************************

*** Select single observation

*** from each BY group

**********************************;
  %if &ignoremissing = true %then %do;
    data _less_dups;
      set _sorted;
      by &by.;
      if (&keep..&lastByVar. or missing(&lastByVar.)) then output;
    run;
  %end;
  %else %do;
    data _less_dups;
      set _sorted;
      by &by.;
      if &keep..&lastByVar. then output;
    run;
  %end;

**********************************

*** Return data set to original

*** order, less dups

**********************************;
  proc sort data = _less_dups out = &out. (drop=_index_);
    by _index_;
  run;

********************************************************************

** Show duplicates

********************************************************************;
  /*Placing _index_ first is non-trivial.

    May be possible with PROC SQL */
  %if &showdups = true %then %do;
    %if &ignoremissing = true %then %do;
      data &from._dups;
        set _sorted (rename = (_index_ = _fromRow_));
        by &by.;

        if not (&keep..&lastByVar. or missing(&lastByVar.)) then output;
      run;
    %end;
    %else %do;
      data &from._dups;
        set _sorted (rename = (_index_ = _fromRow_));
        by &by.;

        if not &keep..&lastByVar. then output;
      run;
    %end;

    proc sort data = &from._dups;
      by _fromRow_;
    run;

    %SetSystemOptions(notes);
    %put NOTE: [MACRO] Duplicate values output to %upcase(&from._dups).;
    %SetSystemOptions(nonotes);
  %end;

********************************************************************

** Housekeeping

********************************************************************;
  %if &debug. = false %then %do;
    proc delete
      library = work
      data = _indexed
            _less_dups
            _sorted
      ;
    run;
  %end;

  %SetSystemOptions(notes);
  %put NOTE: [MACRO] %eval(%GetObsCount(&from) - %GetObsCount(&out)) duplicates removed from %upcase(&from.).;
  %put NOTE: [MACRO] REMOVEDUPLICATES macro used (Total process time):;

  %let duration = %sysfunc(putn(%sysevalf(%sysfunc(datetime())-&startTime.), time12.3));
  %if %sysfunc(minute("&duration."t)) > 0 %then %do;
    %put NO%str(TE-)         real time            %substr(&duration., 3, 8);
  %end;
  %else %do;
    %put NO%str(TE-)         real time            %substr(&duration., 6, 5) seconds;
  %end;
  %SetSystemOptions(&originalNOTES.);

%mend;

Powered by peut-publier

©2020 Matt Trzcinski