Online File

How to use this page


Rick Aster: Professional SAS Programming Shortcuts: Contents

Chapter 82
Program
Text analysis


*
  WORK.WORD (DATA step view): one observation per word token read
  from the fileref TEXT. The fileref is not assigned in this step.
  Output variables:
    wordtext  token as read (list input, at most 24 characters)
    word      token in uppercase with most punctuation removed
    length    length of word, not counting apostrophes or hyphens
    sentence  sentence number, starting at 1
    sn        position of the word within its sentence
*;
data work.word (keep=word wordtext length sentence sn)
    / view=work.word;
   length wordtext word $ 24 endchar letter $ 1;
   retain sentenceend 0 sentence 1;
   infile text flowover;
   * The double trailing at-sign holds the line, so each iteration
     of the step reads a single blank-delimited token. ;
   input wordtext @@;

   *
     Word: convert word text to uppercase
     and remove most punctuation.
   *;
   word = upcase(compress(wordtext,
       '!"#$%&()*+,-./:;<=>?@[\]^`{|}~'));
   length = length(compress(word, "'-"));

   * Tokens that consist only of punctuation produce no observation. ;
   if word ne '' then do;
      * Check for sentence break. ;
      if sentenceend then do;
         *
           If first letter is uppercase, start new sentence.
           The test must look at wordtext, because word has already
           been converted to uppercase and would always qualify.
         *;
         li = anyalpha(wordtext);
         if li then letter = substr(wordtext, li, 1);
         else letter = ' ';
         if anyupper(letter) then do;
            sentence + 1;
            sn = 0;
            end;
         * Lowercase or no letter: the previous word did not end the
           sentence after all. ;
         else sentenceend = 0;
         end;
      * n is a running word count and is not kept in the output. ;
      n + 1;
      sn + 1;
      output;
      end;

   *
      Possible end of sentence: word text ends in period,
      exclamation point, or question mark, possibly followed
      by quotation marks.
   *;
   if sn > 0 then do;
      * Blank out quotation marks so that length locates the last
        character that is not a quotation mark. ;
      endchari = length(translate(wordtext, '  ', '"'''));
      endchar = substr(wordtext, endchari, 1);
      sentenceend = endchar in ('.', '!', '?');
      end;
run;

*
  WORK.SENTENCE: one observation per sentence, taken from the last
  word of each BY group. The sn value of that word is stored under
  the name slength.
*;
data work.sentence (rename=(sn=slength) keep=sentence sn);
   set work.word;
   by sentence;
   * Write only the final observation of each sentence. ;
   if last.sentence then output;
run;

*
  WORK.WSUM: a single observation holding the mean of length
  and the observation count of work.word (_freq_ renamed wcount).
*;
proc summary data=work.word;
   var length;
   output out=work.wsum (rename=(_freq_=wcount))
          mean=length;
run;
*
  WORK.SSUM: a single observation holding the mean of slength
  and the observation count of work.sentence (_freq_ renamed scount).
*;
proc summary data=work.sentence;
   var slength;
   output out=work.ssum (rename=(_freq_=scount))
          mean=slength;
run;
title1 'Thoreau Paragraph';
*
  Summary report: combine the single observations of work.wsum and
  work.ssum and write four labeled values to the print file.
*;
data _null_;
   set work.wsum;
   set work.ssum;
   file print;
   * Leading blank line, then one statistic per line. ;
   put;
   put 'Word count: ' wcount : comma9.;
   put 'Average word length: ' length : 5.2;
   put 'Sentence count: ' scount : comma7.;
   put 'Words per sentence: ' slength : 6.2;
run;

*
  WORK.WORDLIST1: frequency of each distinct word, with _freq_
  renamed n. No VAR statement is used, so only counts are produced.
  The _type_ 0 observation carries the overall count and the
  _type_ 1 observations carry one word each, ordered by frequency.
*;
proc summary order=freq data=work.word;
   class word;
   output out=work.wordlist1 (rename=(_freq_=n));
run;
*
  WORDLIST: each word with its count n and its share of all words
  as a percentage.
*;
data wordlist (keep=word n percent);
   * Read the overall total once. Values read by SET are retained,
     so wordcount stays available for every later observation. ;
   if _n_ = 1 then do;
      set work.wordlist1 (rename=(n=wordcount)
                          where=(_type_ = 0));
      end;
   * Then step through the per-word observations. ;
   set work.wordlist1 (where=(_type_ > 0));
   percent = (n / wordcount) * 100;
run;
* Print the words that make up at least 1 percent of all words. ;
proc print data=wordlist (where=(percent >= 1))
    noobs heading=horizontal;
   var word n percent;
   format percent f7.3
          n comma6.;
run;

 O /\

Global
Statements

RICK ASTER

SAS

BOOKS

Tech | Dictionary

Download | Rastinate

Rick Aster

Professional SAS Programming Shortcuts

Contents/Online Files

Corrections

Catalog Page