@@ -168,10 +168,9 @@ def add_geovarid(self, input_df):
168168 column_list = list (add_geovarid_df .columns )
169169
170170 # Check if blockid in column list
171- if ('blockid' in column_list ) & \
172- ('block' not in column_list ):
171+ if ('blockid' in column_list ) and ('block' not in column_list ):
173172 print ("Adding Block2010 to column list" )
174- # create column in input df
173+ # create column in input df
175174 # Version 2.0 of HUI renames Block2010 to blockid
176175 add_geovarid_df ['Block2010' ] = add_geovarid_df ['blockid' ]
177176
@@ -180,119 +179,99 @@ def add_geovarid(self, input_df):
180179
181180 # Set geoid FIPS code by concatenating state, county, census geography ids
182181 # Check if geocodes are strings
183- geo_levels = {'State' : {'length' : 2 , 'total_len' : 2 , 'required' : ['state' ] },
184- 'County' : {'length' : 3 , 'total_len' : 5 , 'required' : ['state' ,'county' ]},
185- 'Tract' : {'length' : 6 , 'total_len' : 11 , 'required' : ['state' ,'county' ,'tract' ]},
186- 'BlockGroup' : {'length' : 1 , 'total_len' : 12 , 'required' : ['state' ,'county' ,'tract' ,'blockgroup' ],
187- 'notes' :'Block Group code is first digit of block id' },
188- 'Block' : {'length' : 4 , 'total_len' : 15 , 'required' : ['state' ,'county' ,'tract' ,'block' ]}
189- }
182+ geo_levels = {
183+ 'State' : {'length' : 2 , 'total_len' : 2 , 'required' : ['state' ]},
184+ 'County' : {'length' : 3 , 'total_len' : 5 , 'required' : ['state' , 'county' ]},
185+ 'Tract' : {'length' : 6 , 'total_len' : 11 , 'required' : ['state' , 'county' , 'tract' ]},
186+ 'BlockGroup' : {'length' : 1 , 'total_len' : 12 , 'required' : ['state' , 'county' , 'tract' , 'blockgroup' ],
187+ 'notes' : 'Block Group code is first digit of block id' },
188+ 'Block' : {'length' : 4 , 'total_len' : 15 , 'required' : ['state' , 'county' , 'tract' , 'block' ]}
189+ }
190190
191191 # Name of Geovar to add
192- geovarid = self .geolevel + self .geovintage
192+ geovarid = self .geolevel + self .geovintage
193193
194194 # Check to see what geolevels are available
195195 geolevels_available = []
196196 # Check to see what geovarids are available
197197 geovarids_available = []
198- # Make sure all input variable are correctly zero padded and saved as strings
198+
199+ # Make sure all input variables are correctly zero-padded and saved as strings
199200 for geo_level in geo_levels :
200201 # Geo level needs to be all lower case to match api variables
201202 geo_level_lower = geo_level .lower ()
202203 if geo_level_lower in column_list :
203204 # Each geolevel is a zero padded string
204205 length = geo_levels [geo_level ]['length' ]
205- print ("Check length of" ,geo_level_lower ,"expected length" ,length )
206+ print ("Check length of" , geo_level_lower , "expected length" , length )
206207 check_length = self .check_var_length (
207- input_df = add_geovarid_df ,
208- var = geo_level_lower ,
209- expected_length = length )
208+ input_df = add_geovarid_df ,
209+ var = geo_level_lower ,
210+ expected_length = length )
211+
210212 if (check_length == "Match" ) or \
211- (check_length == "Possible match with zero pad" ):
213+ (check_length == "Possible match with zero pad" ):
212214 # Check variable type
213- # Issue with typ converting to int or float
214215 geo_level_type = add_geovarid_df [geo_level_lower ].dtypes
215- print (geo_level_lower ,"is type" ,geo_level_type )
216- add_geovarid_df .loc [:,geo_level_lower ] = \
217- add_geovarid_df [geo_level_lower ].\
218- apply (lambda x : str (x ).zfill (length ))
216+ print (geo_level_lower , "is type" , geo_level_type )
217+ add_geovarid_df [geo_level_lower ] = add_geovarid_df [geo_level_lower ].astype (str ).str .zfill (length )
219218 geolevels_available .append (geo_level_lower )
220219 geo_level_type = add_geovarid_df [geo_level_lower ].dtypes
221- print ("after update" ,geo_level_lower ,"is type" ,geo_level_type )
220+ print ("after update" , geo_level_lower , "is type" , geo_level_type )
221+
222222 # Check to see what geovarids are available
223- geovarid_test = geo_level + self .geovintage
223+ geovarid_test = geo_level + self .geovintage
224224 if geovarid_test in column_list :
225225 total_length_of_geovar = geo_levels [geo_level ]['total_len' ]
226226 check_length = self .check_var_length (
227- add_geovarid_df ,geovarid_test ,total_length_of_geovar )
227+ add_geovarid_df , geovarid_test , total_length_of_geovar )
228228 if (check_length == "Match" ) or \
229- (check_length == "Possible match with zero pad" ):
230- add_geovarid_df .loc [:,geovarid_test ] = \
231- add_geovarid_df [geovarid_test ].apply (lambda x : str (x ).\
232- zfill (total_length_of_geovar ))
229+ (check_length == "Possible match with zero pad" ):
230+ add_geovarid_df [geovarid_test ] = add_geovarid_df [geovarid_test ].astype (str ).str .zfill (total_length_of_geovar )
233231 geovarids_available .append (geovarid_test )
234232 elif (check_length == "Possible convert to float" ):
235233 print ("Possible convert to float" )
236- add_geovarid_df .loc [:,geovarid_test ] = \
237- add_geovarid_df [geovarid_test ].apply (lambda x : str (x )[:- 2 ].\
238- zfill (total_length_of_geovar ))
234+ add_geovarid_df [geovarid_test ] = add_geovarid_df [geovarid_test ].astype (str ).apply (lambda x : str (x )[:- 2 ].zfill (total_length_of_geovar ))
239235 geovarids_available .append (geovarid_test )
240- print ('Geolevels available' ,geolevels_available )
241- print ('Geolvarids available' ,geovarids_available )
236+
237+ print ('Geolevels available' , geolevels_available )
238+ print ('Geolvarids available' , geovarids_available )
239+
242240 # Generate Geovarid based on available columns
243- # What is the total length expected for the geolevel
244241 total_length_of_geovar = geo_levels [self .geolevel ]['total_len' ]
245- # What are the required input variables
246242 required_vars = geo_levels [self .geolevel ]['required' ]
247243
248- print ('Adding' ,geovarid ,'expected length' ,total_length_of_geovar )
249- # Check to make sure that all columns needed are in list
244+ print ('Adding' , geovarid , 'expected length' , total_length_of_geovar )
245+
250246 if all (cols in column_list for cols in geolevels_available ) and \
251- all (cols in column_list for cols in required_vars ) and \
252- (geolevels_available == required_vars ) and \
253- (geolevels_available != []):
254- print ('Dataframe has required geo levels' ,geolevels_available )
255- # Set geovarid to empty
256- add_geovarid_df .loc [:,geovarid ] = ''
247+ all (cols in column_list for cols in required_vars ) and \
248+ (geolevels_available == required_vars ) and \
249+ (geolevels_available != []):
250+ print ('Dataframe has required geo levels' , geolevels_available )
251+ add_geovarid_df [geovarid ] = ''
257252 for geo_level in required_vars :
258253 geo_level_type = add_geovarid_df [geo_level .lower ()].dtypes
259- print (geo_level .lower (),"is type" ,geo_level_type )
260- # Add geo level to geovarid
261- add_geovarid_df .loc [:,geovarid ] = add_geovarid_df [geovarid ] + \
262- add_geovarid_df [geo_level .lower ()]
263- # If geolevel columns are not in list check if block id is in list
264- elif 'Block' + self .geovintage in geovarids_available :
265- print ('Dataframe has Block' ,self .geovintage ,'for new geovar' ,geovarid )
266- # Check that the block id is a zero padded 15 digit string
267- # The geovarid is the first x characters
268- add_geovarid_df .loc [:,geovarid ] = add_geovarid_df ['Block' + self .geovintage ].\
269- apply (lambda x : str (int (x )).zfill (15 )[0 :total_length_of_geovar ])
270- elif 'Tract' + self .geovintage in geovarids_available :
271- print ('Dataframe has Tract' ,self .geovintage ,'for new geovar' ,geovarid )
272- # Check that the tract id is a zero padded 11 digit string
273- # The geovarid is the first x characters
274- #print('Before update confirm',geovarid,'has expected length.')
275- #self.check_var_length(add_geovarid_df,geovarid,total_length_of_geovar)
276- add_geovarid_df .loc [:,geovarid ] = add_geovarid_df ['Tract' + self .geovintage ].\
277- apply (lambda x : str (int (x )).zfill (11 )[0 :total_length_of_geovar ])
278- #print('After update confirm',geovarid,'has expected length.')
279- #self.check_var_length(add_geovarid_df,geovarid,total_length_of_geovar)
254+ print (geo_level .lower (), "is type" , geo_level_type )
255+ add_geovarid_df [geovarid ] += add_geovarid_df [geo_level .lower ()]
256+ elif 'Block' + self .geovintage in geovarids_available :
257+ print ('Dataframe has Block' , self .geovintage , 'for new geovar' , geovarid )
258+ add_geovarid_df [geovarid ] = add_geovarid_df ['Block' + self .geovintage ].astype (str ).str .zfill (15 ).str [:total_length_of_geovar ]
259+ elif 'Tract' + self .geovintage in geovarids_available :
260+ print ('Dataframe has Tract' , self .geovintage , 'for new geovar' , geovarid )
261+ add_geovarid_df [geovarid ] = add_geovarid_df ['Tract' + self .geovintage ].astype (str ).str .zfill (11 ).str [:total_length_of_geovar ]
280262 elif 'GEO_ID' in column_list :
281- # GEO_ID has the FIPS code data using the substring
282- print ('Dataframe has GEO_ID for new geovar' ,geovarid )
283- add_geovarid_df .loc [:,geovarid ] = add_geovarid_df ['GEO_ID' ].\
284- apply (lambda x : str (x ).zfill (11 )[x .find ("US" )+ 2 :\
285- total_length_of_geovar + x .find ("US" )+ 2 ])
263+ print ('Dataframe has GEO_ID for new geovar' , geovarid )
264+ add_geovarid_df [geovarid ] = add_geovarid_df ['GEO_ID' ].astype (str ).apply (lambda x : x .zfill (11 )[x .find ("US" ) + 2 :total_length_of_geovar + x .find ("US" ) + 2 ])
286265 else :
287- print ('Warning: Column list does not have required columns to make' ,geovarid )
266+ print ('Warning: Column list does not have required columns to make' , geovarid )
288267
289268 # Update column list to move geovarid to front
290269 columnlist = [col for col in add_geovarid_df if col != geovarid ]
291- new_columnlist = [geovarid ]+ columnlist
292- # Confirm geovarid is set correctly
293- print ('Confirming' ,geovarid ,'has expected length.' )
270+ new_columnlist = [geovarid ] + columnlist
271+
272+ print ('Confirming' , geovarid , 'has expected length.' )
294273 check_length = self .check_var_length (
295- add_geovarid_df ,geovarid ,total_length_of_geovar )
274+ add_geovarid_df , geovarid , total_length_of_geovar )
296275 return add_geovarid_df [new_columnlist ]
297276
298277
@@ -969,9 +948,9 @@ def run_random_merge_2dfs(self, rounds):
969948
970949 if self .savefiles == True :
971950 print ("Save primary and secondary files with all columns" )
972- savefile = sys .path [ 0 ] + "/" + csv_filepath_primary
951+ savefile = os .path . join ( os . getcwd (), csv_filepath_primary )
973952 output_df ['primary' ].to_csv (savefile , index = False )
974- savefile = sys .path [ 0 ] + "/" + csv_filepath_secondary
953+ savefile = os .path . join ( os . getcwd (), csv_filepath_secondary )
975954 output_df ['secondary' ].to_csv (savefile , index = False )
976955
977956 return output_df
@@ -1008,10 +987,10 @@ def run_random_merge_2dfs(self, rounds):
1008987 print ("Check primary and secondary files to understand why merge is not complete" )
1009988 if self .savefiles == True :
1010989 csv_filepath_primary_almost = self .outputfolder + "/" + csv_filename_primary + '_almost.csv'
1011- savefile = sys .path [ 0 ] + "/" + csv_filepath_primary_almost
990+ savefile = os .path . join ( os . getcwd (), csv_filepath_primary_almost )
1012991 output_df ['primary' ].to_csv (savefile , index = False )
1013992 csv_filepath_secondary_almost = self .outputfolder + "/" + csv_filename_secondary + '_almost.csv'
1014- savefile = sys .path [ 0 ] + "/" + csv_filepath_secondary_almost
993+ savefile = os .path . join ( os . getcwd (), csv_filepath_secondary_almost )
1015994 output_df ['secondary' ].to_csv (savefile , index = False )
1016995 return output_df
1017996
0 commit comments