# bilbao_preprocess.py
#!/usr/bin/env python
# coding: utf-8
# ### EUROSTAT data - sample data
#
# Description of EUROSTAT variables values:
#
# 1. (RB090)gender: 1- male, 2-female
# 2. (RB080)age
# 3. (PD060) Regularly participate in a leisure activity:
# 1 Yes
# 2 No - cannot afford it
# 3 No - other reason
# 4. (PL031) Self-defined current economic status:
# 1 Employee working full-time
# 2 Employee working part-time
# 3 Self-employed working full-time (including family worker)
# 4 Self-employed working part-time (including family worker)
# 5 Unemployed
# 6 Pupil, student, further training, unpaid work experience
# 7 In retirement or in early retirement or has given up business
# 8 Permanently disabled or/and unfit to work
# 9 In compulsory military or community service
# 10 Fulfilling domestic tasks and care responsibilities
# 11 Other inactive person
# 5. (PE040) Highest ISCED level attained:
# 0 pre-primary education
# 1 primary education
# 2 lower secondary education
# 3 (upper) secondary education
# 4 post-secondary non tertiary education
# 5 first stage of tertiary education (not leading directly to an
# advanced research qualification)
# 6 second stage of tertiary
# In[ ]:
import pandas as pd
import numpy as np
from os.path import join as joinpath
data_path = joinpath("data", "bilbao", "before_simulation")
# NOTE(review): joinpath() with a single argument is a no-op, so `sample`
# is simply the sample directory itself.
sample = data_path

# EU-SILC 2013 microdata files for Spain
household = "ES_2013h_EUSILC.csv"            # household data
personal = "ES_2013p_EUSILC.csv"             # personal data
household_register = "ES_2013d_EUSILC.csv"   # household register
personal_register = "ES_2013r_EUSILC.csv"    # personal register

# variables we need from files
d_var = ["DB030", "DB040", "DB090"]  # hhid, federal state, hh weight
# person's id, regularly participates in a leisure activity,
# self-defined current economic status, highest ISCED level attained
p_var = ["PB030", "PD060", "PL031", "PE040"]
# person's id, person's gender, weights, year of birth
r_var = ["RB030", "RB090", "RB050", "RB080"]

# read data and merge
h = pd.read_csv(joinpath(sample, household))
p = pd.read_csv(joinpath(sample, personal))[p_var]
d = pd.read_csv(joinpath(sample, household_register))[d_var]
r = pd.read_csv(joinpath(sample, personal_register))[r_var]

# the household id is the personal id without its last digit
r["hhid"] = (r["RB030"] // 10).astype(int)
# RB080 holds the year of birth; convert it to an age.
# NOTE(review): reference year 2021 with 2013 survey data — confirm intended.
r["RB080"] = 2021 - r["RB080"]
r = r.merge(d, left_on="hhid", right_on="DB030")
# r = r.merge(p, left_on="RB030", right_on="PB030")

# build the individuals table; .copy() prevents the assignments below from
# operating on a view of `r` (SettingWithCopyWarning)
ind = r[["RB090", "RB080"]].copy()
ind = ind.rename(columns={"RB090": "gender", "RB080": "age"})
ind["gender"] = ind["gender"].replace({2: "f", 1: "m"})
ind.to_csv(joinpath(data_path, "ind.csv"), sep=",")
# ### Aggregated data
#
# Preprocess it to fit the IPF algorithm
# In[ ]:
# Read the population-by-district/neighbourhood/age table and reshape it
# into the constraints table the IPF algorithm expects.
cons = pd.read_csv(
    joinpath(data_path, "numero_habitantes_distrito_barrio_edad_2020.csv"),
    sep=';')
# drop unused columns by position (keyword `axis=`: the positional axis
# argument to drop() was removed in pandas 2.0)
cons = cons.drop(cons.columns[[0, 3, 4]], axis=1)

# aggregate the neighbourhood rows up to district level
cons = cons[1:46].groupby(cons.columns[1])[cons.columns[2:]].sum()

# save the totals column, then remove it from the table
totals = cons["TOTAL BARRIO"]
cons = cons.drop("TOTAL BARRIO", axis=1)

# keep only the age columns of interest
cons = cons.drop(cons.columns[0:20], axis=1)
cons = cons.drop(cons.columns[160:224], axis=1)

# columns alternate male/female; total each gender across all ages
males = cons[cons.columns[::2]].sum(axis=1)
females = cons[cons.columns[1::2]].sum(axis=1)

# collapse each male/female column pair into one age-group column
# (transpose-groupby-transpose: DataFrame.groupby(axis=1) is deprecated)
cons = cons.T.groupby((np.arange(len(cons.columns)) // 2) + 1).sum().T

cons['f'] = females
cons['m'] = males
cons['totals'] = totals
cons.columns = cons.columns.astype(str)

# merge the first two rows into a single "ABANDO" row
# (pd.concat: DataFrame.append was removed in pandas 2.0)
sum_cons = cons.iloc[0:2].sum(axis=0).rename("ABANDO")
cons = pd.concat([cons.iloc[2:], sum_cons.to_frame().T])
cons.to_csv(joinpath(data_path, "cons.csv"), sep=',')
# ### Filter amenities
# In[ ]:
# amenity categories (values of the OpenStreetMap "amenity" tag)
sustence = ["bar", "cafe", "biergarten", "fast_food",
            "food_court", "ice_cream", "pub", "restaurant"]  # duplicate "cafe" removed
education = ["college", "driving_school", "kindergarten", "language_school",
             "library", "toy_library", "music_school", "school", "university"]
financial = ["bank"]
healthcare = ["clinic", "dentist", "doctors", "hospital",
              "nursing_home", "pharmacy", "social_facility", "veterinary"]
entertainment = ["arts_center", "casino", "cinema", "community_centre", "conference_centre",
                 "events_venue", "nightclub", "planetarium", "social_centre", "studio", "theatre"]
public_services = ["courthouse", "embassy", "fire_station", "police", "post_box",
                   "post_depot", "post_office", "prison", "ranger_station", "townhall"]
others = ["marketplace"]

# template record for an extracted amenity; originally this dict was
# assigned to `sustence`, accidentally clobbering the category list above
amenity_record = {
    # NOTE(review): "anemity" looks like a typo for "amenity" — left as-is
    # because the key may be consumed elsewhere
    "anemity": "",
    "lat": "",
    "long": ""
}

# read the osm file
# NOTE(review): handle is never closed — wrap in `with` once the
# extraction below is implemented
osm = open(joinpath(data_path, "amenities_bilbao.osm"))
# TODO: extract amenity name and location from the OSM XML
# ### Preprocess OD matrices
#
# - districts to be used: 
# In[ ]:
# Read the raw origin-destination matrix and aggregate its zones into the
# eight Bilbao districts.
odm = pd.read_csv(joinpath(data_path, "ODM_bilbao1.csv"))

# zones of the raw ODM grouped by district
DEUSTO = ['D1 NORTE', 'D1 SUR']
URIBARRI = ["CASTANOS", "D2 ALTO", "RIA ESTE",
            "RIA OESTE", "MB L3 (METRO BILBAO LINEA 3)"]
OTXARKOAGA_TXURDINAGA = ["OTXARKOAGA", "TXURDINAGA"]
BEGONA = ["D4 SUR"]
IBAIONDO = ["CASCO VIEJO", "D5 ESTE", "D5 OESTE"]
ABANDO = ['CENTRO', 'HOSPITAL', 'ACCESOS V (ACCESOS VIARIOS)', 'INTERMODAL']
ERREKALDE = ["D7 NORTE", "D7 SUR"]
BASURT_ZORROTZA = ["BASURTO", "ZORROZA"]
districts = ["ABANDO",
             "BASURT_ZORROTZA",
             "BEGONA",
             "DEUSTO",
             "ERREKALDE",
             "IBAIONDO",
             "OTXARKOAGA_TXURDINAGA",
             "URIBARRI"]

# build a zone -> district mapping and rewrite the ODM in a single pass
# (replaces eight near-identical replace loops; equivalent because zone
# names are unique and no district name occurs as a zone name)
zone_to_district = {}
for district_name, zones in [("DEUSTO", DEUSTO),
                             ("URIBARRI", URIBARRI),
                             ("OTXARKOAGA_TXURDINAGA", OTXARKOAGA_TXURDINAGA),
                             ("BEGONA", BEGONA),
                             ("IBAIONDO", IBAIONDO),
                             ("ABANDO", ABANDO),
                             ("ERREKALDE", ERREKALDE),
                             ("BASURT_ZORROTZA", BASURT_ZORROTZA)]:
    for zone in zones:
        zone_to_district[zone] = district_name
odm = odm.replace(zone_to_district)
odm.to_csv(joinpath(data_path, "ODM_bilbao_districtLevel.csv"))

# merge rows that now share the same (origin district, destination district)
odm = odm.groupby(['From location', 'To location']).sum()
odm.to_csv(joinpath(data_path, "ODM_bilbao_districtLevel_reduced.csv"))