Don't fear the data: working w/ microdata, recoding race.

*This is a big document to recode race and ethnicity;
*there are now way fewer categories;
*it also includes a way to sort by state;

PROC IMPORT OUT= WORK.recode_race
DATAFILE= "\\tsclient\Y\recoding_race_in_pumas_2000\recode_race.csv"
DBMS=CSV REPLACE;
GETNAMES=YES;
DATAROW=2;
RUN;

*make pumas 5 digits- keep leading zeros;
DATA file_name2;
SET recode_race;
PUMA = PUT(US00A_PUMA, z5.);
RUN;

*make statefip 2 digits (keep a leading zero);
options obs=max;
DATA file_name2;
SET file_name2;
FIP = PUT(US00A_STATEFIP, z2.);
RUN;

*now i want to turn my NIU's (9999999) and (9999998) into nulls;
options obs=max;
data file_name2;
set file_name2;
if inctot = 9999999 then inctot =.;
if inctot = 9999999 then inctot =.;
run;

*rename puma to puma1 to stayclean;
options obs=max;
data file_name2;
set file_name2;
rename PUMA=PUMA1;run;

*concatenate fip+puma1 to make puma;
options obs=max;
data file_name2;
set file_name2;
PUMA=catt(fip,puma1);run;

*make sure puma is 7 digit;
*drop puma1;
*it doesn't seem to like z7 here, think it's a char$ v.s #(number) issue);
options obs=max;
DATA file_name2;
SET file_name2;
PUMA = PUT(PUMA, z7.);run;
options obs=max;
DATA file_name2;
SET file_name2;
drop puma1;
RUN;

*I want to create a table of means;
*I want it based on a puma level;
*first I want to format so that my gender is clear, my edattan is clear, my race is clear;

proc format;
value rac
10='White'
20='Black'
21='Black African'
22='Black Caribbean'
23='Afro-Ecuadorian'
24='Other Black'
30='Indigenous'
31='American Indian'
40='Asian'
41='Chinese'
42='Japanese'
43='Korean'
44='Vietnamese'
45='Filipino'
46='Indian'
47='Pakistani'
48='Bangladeshi'
49='Other Asian'
50='Mixed race'
51='Brown (Brazil)'
52='Mestizo (Indigenous and White)'
53='Mulatto (Black and White) (Ecuador)'
54='Coloured (South Africa)'
55='Two or more races'
60='Other'
61='Montubio (Ecuador)'
99='Unknown';

value educ
0='NIU (not in universe)'
1='Less than primary completed'
2='Primary completed'
3='Secondary completed'
4='University completed'
9='Unknown';

value sx
9='Unknown'
2='Female'
1='Male';run;

proc format;
value fips
01='Alabama'
02='Alaska'
04='Arizona'
05='Arkansas'
06='California'
08='Colorado'
09='Connecticut'
10='Delaware'
11='District of Columbia'
12='Florida'
13='Georgia'
15='Hawaii'
16='Idaho'
17='Illinois'
18='Indiana'
19='Iowa'
20='Kansas'
21='Kentucky'
22='Louisiana'
23='Maine'
24='Maryland'
25='Massachusetts'
26='Michigan'
27='Minnesota'
28='Mississippi'
29='Missouri'
30='Montana'
31='Nebraska'
32='Nevada'
33='New Hampshire'
34='New Jersey'
35='New Mexico'
36='New York'
37='North Carolina'
38='North Dakota'
39='Ohio'
40='Oklahoma'
41='Oregon'
42='Pennsylvania'
44='Rhode island'
45='South Carolina'
46='South Dakota'
47='Tennessee'
48='Texas'
49='Utah'
50='Vermont'
51='Virginia'
53='Washington'
54='West Virginia'
55='Wisconsin'
56='Wyoming';

proc format;
value HISP
000='Not hispanic'
99='Hispanic'
100='Mexican'
101='Mexican, n.e.c.'
102='Mexican American'
103='Mexicano/Mexicana'
104='Chicano/Chicana'
105='La Raza'
106='Mexican American Indian'
107='Mexico'
200='Puerto Rican'
300='Cuban'
400='Other Spanish, 1980'
410='Central/South American, 1970'
411='Costa Rican'
412='Guatemalan'
413='Honduran'
414='Nicaraguan'
415='Panamanian'
416='Salvadoran'
417='Central American'
418='Central American Indian'
419='Canal Zone'
420='Argentinean'
421='Bolivian'
422='Chilean'
423='Colombian'
424='Ecuadorian'
425='Paraguayan'
426='Peruvian'
427='Uruguayan'
428='Venezuelan'
429='South American Indian'
430='Criollo'
431='South American'
440='Other Spanish, 1970'
450='Spaniard'
451='Andalusian'
452='Asturian'
453='Castillian'
454='Catalonian'
455='Balearic Islander'
456='Gallego'
457='Valencian'
458='Canarian'
459='Spanish Basque'
460='Dominican'
465='Latin American'
470='Hispanic'
480='Spanish'
490='Californio'
491='Tejano'
492='Nuevo Mexicano'
493='Spanish American'
494='Spanish American Indian'
495='Meso American Indian'
496='Mestizo'
497='Other Spanish, Hispanic, Latino'
498='Not specified (FOSDIC)'
499='Not classified'
999='Not reported'; run;

*recode fip to numeric;
options obs = max;
data Twincities2010;
set Twincities2010;
newfip = input(fip,2.);
run;

*to make a data set for minn 2000 ONLY;
*you make ;
data raceMN;
set work.file_name2;
if fip = 27;
run;

*write the 2000 data for mn to a csv;
proc export data=work.raceMN
outfile="\\tsclient\Y\recoding_race_in_pumas_2000\MNrace.csv"
dbms=csv
replace;run;

*lose some unneccessary variables;
*us;
data Racemn;
set work.Racemn;
drop wtper sample cntry resident marstd edattand us00A_puma;
run;

*write the 2000 data for mn to a csv;
proc export data=work.raceMN
outfile="\\tsclient\Y\recoding_race_in_pumas_2000\MNrace.csv"
dbms=csv
replace;run;

*to make a data set for CA 2000 ONLY;
*you make ;
data raceCA;
set work.file_name2;
if fip = 06;
run;

*lose some unneccessary variables;
*us;
data Raceca;
set work.Raceca;
drop wtper sample cntry resident marstd edattand us00A_puma;
run;

*write the 2000 data for ca to a csv;
proc export data=work.raceCA
outfile="\\tsclient\Y\recoding_race_in_pumas_2000\CArace.csv"
dbms=csv
replace;run;

*to make a data set for twin cities 2000 ONLY;
*you make ;
data twincities2010;
set work.Racemn;
drop US00A_STATEFIP newfip;
if puma = 2701401 or puma = 2701402 or puma = 2701403 or puma = 2701404 or puma = 2701405
or puma=2701301 or puma = 2701302 or puma = 2701303 or puma = 2701406 or puma = 2701601 or puma = 2701602 or puma = 2701501 or puma = 2701502;
run;

*recode fip to numeric;
options obs = max;
data Twincities2010;
set Twincities2010;
newfip = input(fip,2.);
run;

*make a new code for hisp2 that combines categories;
data twincities2010;
set twincities2010;
hisp2 =0;
if (hispan >=100) and (hispan <=107) then hisp2=99;
if (hispan >=108) and (hispan <=497) then hisp2=99;
run;

*make a newcode3 for race2 that combines categories;
data twincities2010;
set twincities2010;
race2 =60;
if (race = 10) then race2 =10;
if (race >= 20) and (race <=24) then race2 =20;
if (race = 30) or (race=31) then race2 = 30;
if (race >= 40) and (race <=49) then race2=40;
if (race >=50) and (race <=55) then race2=55;
run;

*tabulate twin cities, and export;
*this will include code to export;
*it also includes the creation of a nifty table;
*that will be exported;
*you can skip this if you just want to make a;
*csv for export;

ods tagsets.excelxp
file="\\tsclient\Y\recoding_race_in_pumas_2000\TwinCitiesRace.xls"
style = minimal
options (orientation = 'landscape'
fittopage = 'yes'
pages_fitwidth = '1'
pages_fitheight = '100');

*now the proc tab code;
options obs = max;
proc tabulate data = Twincities2010;
class newfip puma edattan race2 sex hisp2;
var inctot;
table newfip = 'state' * puma = 'PUMA area' , edattan='Edattan' * (rowpctn) race2='race' * (rowpctn) sex ='sex' * (rowpctn) hisp2 = 'hispanic' * (rowpctn) inctot = 'income' * (mean);
format newfip fips. sex sx. hisp2 HISP. edattan educ. race2 rac. inctot;
run;

ods tagsets.excelxp close;

Don't fear the data

Friday, July 10, 2015

working w/ microdata, recoding race.

No comments:

Post a Comment