Friday, July 10, 2015

Working with Microdata, Recoding Race

*This is a big document to recode race and ethnicity;
*there are now way fewer categories;
*it also includes a way to subset by state;

PROC IMPORT OUT= WORK.recode_race
            DATAFILE= "\\tsclient\Y\recoding_race_in_pumas_2000\recode_race.csv"
            DBMS=CSV REPLACE;
     GETNAMES=YES;
     DATAROW=2;
RUN;
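
*optional sanity check (just a sketch): proc contents lists each variable and its type,;
*handy for confirming that US00A_PUMA and US00A_STATEFIP imported as numeric, which is;
*why the leading zeros get rebuilt below;
proc contents data=work.recode_race;
run;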

*make pumas 5 digits- keep leading zeros;
DATA file_name2;
SET recode_race;
PUMA = PUT(US00A_PUMA, z5.);
RUN;

*make statefip 2 digits (keep a leading zero);
options obs=max;
DATA file_name2;
SET file_name2;
FIP = PUT(US00A_STATEFIP, z2.);
RUN;

*now i want to turn my NIU's (9999999) and (9999998) into nulls;
options obs=max;
data file_name2;
set file_name2;
if inctot = 9999999 then inctot =.;
if inctot = 9999998 then inctot =.;
run;
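
*optional check (a sketch): nmiss should now equal the old count of NIU codes;
proc means data=file_name2 n nmiss min max;
var inctot;
run;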

*rename puma to puma1 to stay clean;
options obs=max;
data file_name2;
set file_name2;
rename PUMA=PUMA1;run;

*concatenate fip+puma1 to make puma;
options obs=max;
data file_name2;
set file_name2;
PUMA=catt(fip,puma1);run;

*make sure puma is 7 digits;
*drop puma1;
*z7. alone fails here because puma is now character, not numeric;
*so round-trip it through input() to apply the z7. format;
options obs=max;
DATA file_name2;
SET file_name2;
PUMA = PUT(INPUT(PUMA, 7.), z7.);run;
options obs=max;
DATA file_name2;
SET file_name2;
drop puma1;
RUN;
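
*optional check (a sketch): every puma should now be exactly 7 characters long;
proc sql;
select min(length(puma)) as shortest, max(length(puma)) as longest
from file_name2;
quit;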

*I want to create a table of means;
*I want it based on a puma level;
*first I want to format so that my gender is clear, my edattan is clear, my race is clear;

proc format;
value rac
10='White'
20='Black'
21='Black African'
22='Black Caribbean'
23='Afro-Ecuadorian'
24='Other Black'
30='Indigenous'
31='American Indian'
40='Asian'
41='Chinese'
42='Japanese'
43='Korean'
44='Vietnamese'
45='Filipino'
46='Indian'
47='Pakistani'
48='Bangladeshi'
49='Other Asian'
50='Mixed race'
51='Brown (Brazil)'
52='Mestizo (Indigenous and White)'
53='Mulatto (Black and White) (Ecuador)'
54='Coloured (South Africa)'
55='Two or more races'
60='Other'
61='Montubio (Ecuador)'
99='Unknown';


value educ
0='NIU (not in universe)'
1='Less than primary completed'
2='Primary completed'
3='Secondary completed'
4='University completed'
9='Unknown';


value sx
9='Unknown'
2='Female'
1='Male';run;
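
*optional preview (a sketch, assuming the extract includes sex and race):;
*apply the new formats in a quick freq before building the big table;
proc freq data=file_name2;
tables sex race;
format sex sx. race rac.;
run;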


proc format;
value fips
01='Alabama'
02='Alaska'
04='Arizona'
05='Arkansas'
06='California'
08='Colorado'
09='Connecticut'
10='Delaware'
11='District of Columbia'
12='Florida'
13='Georgia'
15='Hawaii'
16='Idaho'
17='Illinois'
18='Indiana'
19='Iowa'
20='Kansas'
21='Kentucky'
22='Louisiana'
23='Maine'
24='Maryland'
25='Massachusetts'
26='Michigan'
27='Minnesota'
28='Mississippi'
29='Missouri'
30='Montana'
31='Nebraska'
32='Nevada'
33='New Hampshire'
34='New Jersey'
35='New Mexico'
36='New York'
37='North Carolina'
38='North Dakota'
39='Ohio'
40='Oklahoma'
41='Oregon'
42='Pennsylvania'
44='Rhode Island'
45='South Carolina'
46='South Dakota'
47='Tennessee'
48='Texas'
49='Utah'
50='Vermont'
51='Virginia'
53='Washington'
54='West Virginia'
55='Wisconsin'
56='Wyoming';
run;

proc format;
value HISP
000='Not Hispanic'
99='Hispanic'
100='Mexican'
101='Mexican, n.e.c.'
102='Mexican American'
103='Mexicano/Mexicana'
104='Chicano/Chicana'
105='La Raza'
106='Mexican American Indian'
107='Mexico'
200='Puerto Rican'
300='Cuban'
400='Other Spanish, 1980'
410='Central/South American, 1970'
411='Costa Rican'
412='Guatemalan'
413='Honduran'
414='Nicaraguan'
415='Panamanian'
416='Salvadoran'
417='Central American'
418='Central American Indian'
419='Canal Zone'
420='Argentinean'
421='Bolivian'
422='Chilean'
423='Colombian'
424='Ecuadorian'
425='Paraguayan'
426='Peruvian'
427='Uruguayan'
428='Venezuelan'
429='South American Indian'
430='Criollo'
431='South American'
440='Other Spanish, 1970'
450='Spaniard'
451='Andalusian'
452='Asturian'
453='Castillian'
454='Catalonian'
455='Balearic Islander'
456='Gallego'
457='Valencian'
458='Canarian'
459='Spanish Basque'
460='Dominican'
465='Latin American'
470='Hispanic'
480='Spanish'
490='Californio'
491='Tejano'
492='Nuevo Mexicano'
493='Spanish American'
494='Spanish American Indian'
495='Meso American Indian'
496='Mestizo'
497='Other Spanish, Hispanic, Latino'
498='Not specified (FOSDIC)'
499='Not classified'
999='Not reported'; run;

*make a data set for Minnesota 2000 ONLY;
data raceMN;
set work.file_name2;
if fip = '27';
run;
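
*the same subset, written with a where= data set option instead (a sketch);
data raceMN;
set work.file_name2(where=(fip='27'));
run;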

 
*lose some unnecessary variables;
data Racemn;
set work.Racemn;
drop wtper sample cntry resident marstd edattand us00A_puma;
run;

*write the 2000 data for mn to a csv;
proc export data=work.raceMN
   outfile="\\tsclient\Y\recoding_race_in_pumas_2000\MNrace.csv"
   dbms=csv
   replace;run;


*make a data set for CA 2000 ONLY;
data raceCA;
set work.file_name2;
if fip = '06';
run;

*lose some unnecessary variables;
data Raceca;
set work.Raceca;
drop wtper sample cntry resident marstd edattand us00A_puma;
run;

*write the 2000 data for ca to a csv;
proc export data=work.raceCA
   outfile="\\tsclient\Y\recoding_race_in_pumas_2000\CArace.csv"
   dbms=csv
   replace;run;


*make a data set for the Twin Cities 2000 ONLY;
data twincities2010;
set work.Racemn;
drop US00A_STATEFIP;
if puma in ('2701401','2701402','2701403','2701404','2701405',
'2701301','2701302','2701303','2701406','2701601','2701602','2701501','2701502');
run;

*recode fip to numeric;
options obs = max;
data Twincities2010;
set Twincities2010;
newfip = input(fip,2.);
run;

*make a new code for hisp2 that combines categories;
data twincities2010;
set twincities2010;
hisp2 = 0;
if 100 <= hispan <= 497 then hisp2 = 99;
run;

*make a new code race2 that combines categories;
data twincities2010;
set twincities2010;
race2 = 60;
if race = 10 then race2 = 10;
if 20 <= race <= 24 then race2 = 20;
if race = 30 or race = 31 then race2 = 30;
if 40 <= race <= 49 then race2 = 40;
if 50 <= race <= 55 then race2 = 55;
run;
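
*optional check (a sketch): crosstab the old codes against the new ones to confirm both recodes;
proc freq data=twincities2010;
tables hispan*hisp2 race*race2 / list missing;
run;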


*tabulate twin cities, and export;
*this creates a nifty table and exports it to excel;
*you can skip this if you just want to make a csv for export;

ods tagsets.excelxp
file="\\tsclient\Y\recoding_race_in_pumas_2000\TwinCitiesRace.xls"
style = minimal
options (orientation = 'landscape'
fittopage = 'yes'
pages_fitwidth = '1'
pages_fitheight = '100');


*now the proc tab code;
options obs = max;
proc tabulate data = Twincities2010;
class   newfip puma edattan race2 sex hisp2;
var inctot;
table newfip = 'state' * puma = 'PUMA area'  , edattan='Edattan' * (rowpctn) race2='race' * (rowpctn) sex ='sex' * (rowpctn) hisp2 = 'hispanic' * (rowpctn) inctot = 'income' * (mean);
format newfip fips. sex sx. hisp2 HISP. edattan educ. race2 rac. inctot;
run;

ods tagsets.excelxp close;

Wednesday, July 8, 2015

Working with MicroData, Creating Pumas

*upload your .csv file that you generated as an ipums extract;
*in ipums you can select for a state- this code assumes that you have done just that;
*this code works to make a big table of all pumas for whatever csv you start with;
*check your leading zeros in .xls;
*remember to include statefip in your download;

PROC IMPORT OUT= WORK.file_name1
            DATAFILE= "filepath.csv"
            DBMS=CSV REPLACE;
     GETNAMES=YES;
     DATAROW=2;
RUN;

*make pumas 5 digits- keep leading zeros;
DATA file_name2;
SET file_name1;
PUMA = PUT(US00A_PUMA, z5.);
RUN;

*make statefip 2 digits (keep a leading zero);
options obs=max;
DATA file_name2;
SET file_name2;
FIP = PUT(US00A_STATEFIP, z2.);
RUN;

*now i want to turn my NIU's (9999999) and (9999998) into nulls;
*this is specific to my data set, but shows how to make a value a null;
options obs=max;
data file_name2;
set file_name2;
if inctot = 9999999 then inctot =.;
if inctot = 9999998 then inctot =.;
run;

*rename puma to puma1 to stay clean;
*rename a variable;
options obs=max;
data file_name2;
set file_name2;
rename PUMA=PUMA1;run;

*concatenate fip+puma1 to make puma;
options obs=max;
data file_name2;
set file_name2;
PUMA=catt(fip,puma1);run;

*make sure puma is 7 digits;
*drop puma1;
*z7. alone fails here because puma is now character, not numeric;
*so round-trip it through input() to apply the z7. format;
*also dropping some variables here;
options obs=max;
DATA file_name2;
SET file_name2;
PUMA = PUT(INPUT(PUMA, 7.), z7.);run;
options obs=max;
DATA file_name2;
SET file_name2;
drop puma1 geolev1;
RUN;

*I want to create a table of means;
*I want it based on a puma level;
*first I want to format so that my gender is clear, my edattan is clear, my race is clear;

proc format;
value rac
10='White'
20='Black'
21='Black African'
22='Black Caribbean'
23='Afro-Ecuadorian'
24='Other Black'
30='Indigenous'
31='American Indian'
40='Asian'
41='Chinese'
42='Japanese'
43='Korean'
44='Vietnamese'
45='Filipino'
46='Indian'
47='Pakistani'
48='Bangladeshi'
49='Other Asian'
50='Mixed race'
51='Brown (Brazil)'
52='Mestizo (Indigenous and White)'
53='Mulatto (Black and White) (Ecuador)'
54='Coloured (South Africa)'
55='Two or more races'
60='Other'
61='Montubio (Ecuador)'
99='Unknown';


value educ
0='NIU (not in universe)'
1='Less than primary completed'
2='Primary completed'
3='Secondary completed'
4='University completed'
9='Unknown';


value sx
9='Unknown'
2='Female'
1='Male';run;


proc format;
value fips
01='Alabama'
02='Alaska'
04='Arizona'
05='Arkansas'
06='California'
08='Colorado'
09='Connecticut'
10='Delaware'
11='District of Columbia'
12='Florida'
13='Georgia'
15='Hawaii'
16='Idaho'
17='Illinois'
18='Indiana'
19='Iowa'
20='Kansas'
21='Kentucky'
22='Louisiana'
23='Maine'
24='Maryland'
25='Massachusetts'
26='Michigan'
27='Minnesota'
28='Mississippi'
29='Missouri'
30='Montana'
31='Nebraska'
32='Nevada'
33='New Hampshire'
34='New Jersey'
35='New Mexico'
36='New York'
37='North Carolina'
38='North Dakota'
39='Ohio'
40='Oklahoma'
41='Oregon'
42='Pennsylvania'
44='Rhode Island'
45='South Carolina'
46='South Dakota'
47='Tennessee'
48='Texas'
49='Utah'
50='Vermont'
51='Virginia'
53='Washington'
54='West Virginia'
55='Wisconsin'
56='Wyoming';
run;

*recode fip to numeric;
data file_name2;
set file_name2;
newfip = input(fip,2.);
run;
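
*variant (a sketch): the ?? modifier suppresses log notes in case fip ever holds a non-numeric value;
data file_name2;
set file_name2;
newfip = input(fip, ?? 2.);
run;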

*this creates a nifty table and exports it to excel;
*you can skip this if you just want to make a csv for export;

ods tagsets.excelxp
file="\\tsclient\Y\ipumsCodap\Ipums_and_Pumas\export_name.xls"
style = minimal
options (orientation = 'landscape'
fittopage = 'yes'
pages_fitwidth = '1'
pages_fitheight = '100');


*now the proc tab code;
options obs = 100;
proc tabulate data = file_name2;
class   newfip puma edattan raceus sex;
var inctot;
table newfip = 'state' * puma = 'PUMA area'  , edattan='Edattan' * (n rowpctn) raceus='race' sex ='sex' inctot = 'income' * (mean);
format newfip fips. sex sx. edattan educ. raceus rac. inctot;
run;

*lose some unnecessary variables;
options obs = max;
data file_name2;
set work.file_name2;
drop wtper sample cntry resident marstd edattand us00A_puma;
run;


*make a data set for BERKELEY 2000 ONLY;
data BERKDAT00;
set work.file_name2;
drop US00A_STATEFIP newfip;
if puma = '0602401';
run;


*make a data set for MARIN 2000 ONLY;
data MARINDAT00;
set work.file_name2;
drop US00A_STATEFIP newfip;
if puma in ('0601201','0601202');
run;

*make a data set for OTOWN 2000 ONLY;
data OAKLANDDAT00;
set work.file_name2;
if puma in ('0602402','0602403','0602404');
drop US00A_STATEFIP newfip;
run;

*make a data set for part of Tulare County;
data tulare_part00;
set work.file_name2;
if puma = '0603503';
drop US00A_STATEFIP newfip;
run;

*make a data set for all of Tulare County;
data tulare_all00;
set work.file_name2;
if puma in ('0603503','0603501','0603502');
drop US00A_STATEFIP newfip;
run;

*write the 2000 data for BERKELEY to a csv;
proc export data=work.BERKDAT00
   outfile="\\tsclient\Y\ipumsCodap\Ipums_and_Pumas\berkeley_puma_ca_00.csv"
   dbms=csv
   replace;run;

*write the 2000 data for tulare to a csv;
proc export data=work.tulare_part00
   outfile="\\tsclient\Y\ipumsCodap\Ipums_and_Pumas\tulare_part_00.csv"
   dbms=csv
   replace;run;

*write the 2000 data for tulare_all to a csv;
proc export data=work.tulare_all00
   outfile="\\tsclient\Y\ipumsCodap\Ipums_and_Pumas\tulare_all_00.csv"
   dbms=csv
   replace;run;

*write the 2000 data for OAKLAND to a csv;
proc export data=work.OAKLANDDAT00
   outfile="\\tsclient\Y\ipumsCodap\Ipums_and_Pumas\otown_puma_ca_00.csv"
   dbms=csv
   replace;run;

*write the 2000 data for MARIN to a csv;
proc export data=work.MARINDAT00
   outfile="\\tsclient\Y\ipumsCodap\Ipums_and_Pumas\marin_puma_ca_00.csv"
   dbms=csv
   replace;run;


Wednesday, June 24, 2015

SAS CODE - "DROP VARIABLE"

*this code snippet takes data demos1, and drops 3 unnecessary ;
*variables from it;
*obs=max is a safe thing to put in if you've been messing with fewer obs;

options obs = max;
data demos1;
set work.demos1;
drop wtper sample cntry;
run;
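
*the same drop, written with the drop= data set option instead (a sketch);
data demos1;
set work.demos1(drop=wtper sample cntry);
run;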

SAS CODE "Simple Random Sample and Export"

*The following piece of SAS imports the really long and wide file;
* terrapop_extract_1016.csv, and randomly draws 5000 households;
* out of it. In this case, a household is determined by a shared serial;
* number -- everyone in the household has the same serial number,;
* so 5000 households exports as approximately 11000 people.;
*seed is part of the randomization process, srs is simple random sample;

PROC IMPORT OUT= WORK.demos 
            DATAFILE= "filepath.csv" 
            DBMS=CSV REPLACE;
     GETNAMES=YES;
     DATAROW=2; 
RUN;

proc surveyselect data=demos out=demos1 method=srs sampsize=5000 seed=377183 noprint;
samplingunit serial;
run;
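
*optional check (a sketch): count households and people in the sample; serial identifies the household;
proc sql;
select count(distinct serial) as n_households, count(*) as n_people
from demos1;
quit;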
*this is my 5000-household random sample;
options obs=max;
proc export data=work.demos1
   outfile='Y:\demos1.csv'
   dbms=csv
   replace;run;


Here is a little snapshot of the data (a single raw record from the extract):

44135083 201896 6 840001 840001 0 2 999 12000 20 350 11 1 2.65E+08 4583402 99 110 1 2 2 100 5 8407 2 45 0 11900 2010