# bilbao_preprocess.py
#!/usr/bin/env python
# coding: utf-8
# ### EUROSTAT data - sample data
#
# Description of EUROSTAT variables values:
#
# 1. (RB090)gender: 1- male, 2-female
# 2. (RB080)age
# 3. (PD060) Regularly participate in a leisure activity:
# 1 Yes
# 2 No - cannot afford it
# 3 No - other reason
# 4. (PL031) Self-defined current economic status:
# 1 Employee working full-time
# 2 Employee working part-time
# 3 Self-employed working full-time (including family worker)
# 4 Self-employed working part-time (including family worker)
# 5 Unemployed
# 6 Pupil, student, further training, unpaid work experience
# 7 In retirement or in early retirement or has given up business
# 8 Permanently disabled or/and unfit to work
# 9 In compulsory military or community service
# 10 Fulfilling domestic tasks and care responsibilities
# 11 Other inactive person
# 5. (PE040) Highest ISCED level attained:
# 0 pre-primary education
# 1 primary education
# 2 lower secondary education
# 3 (upper) secondary education
# 4 post-secondary non tertiary education
# 5 first stage of tertiary education (not leading directly to an
# advanced research qualification)
# 6 second stage of tertiary
# In[ ]:
import pandas as pd
import numpy as np
from os.path import join as joinpath
data_path = joinpath("data", "bilbao", "before_simulation")
# NOTE(review): joinpath() with a single argument is a no-op, so `sample`
# is simply the sample directory itself.
sample = data_path

# EU-SILC 2013 microdata files for Spain
household = "ES_2013h_EUSILC.csv"            # household data
personal = "ES_2013p_EUSILC.csv"             # personal data
household_register = "ES_2013d_EUSILC.csv"   # household register
personal_register = "ES_2013r_EUSILC.csv"    # personal register

# variables we need from files
d_var = ["DB030", "DB040", "DB090"]  # hhid, federal state, hh weight
# person's id, regularly participates in a leisure activity,
# self-defined current economic status, highest ISCED level attained
p_var = ["PB030", "PD060", "PL031", "PE040"]
# person's id, person's gender, weights, year of birth
r_var = ["RB030", "RB090", "RB050", "RB080"]

# read data and merge
h = pd.read_csv(joinpath(sample, household))
p = pd.read_csv(joinpath(sample, personal))[p_var]
d = pd.read_csv(joinpath(sample, household_register))[d_var]
r = pd.read_csv(joinpath(sample, personal_register))[r_var]

# the household id is the personal id without its last digit
r["hhid"] = (r["RB030"] // 10).astype(int)
# RB080 holds the year of birth; convert it to an age.
# NOTE(review): reference year 2021 with 2013 survey data — confirm intended.
r["RB080"] = 2021 - r["RB080"]
r = r.merge(d, left_on="hhid", right_on="DB030")
# r = r.merge(p, left_on="RB030", right_on="PB030")

# build the individuals table; .copy() prevents the assignments below from
# operating on a view of `r` (SettingWithCopyWarning)
ind = r[["RB090", "RB080"]].copy()
ind = ind.rename(columns={"RB090": "gender", "RB080": "age"})
ind["gender"] = ind["gender"].replace({2: "f", 1: "m"})
ind.to_csv(joinpath(data_path, "ind.csv"), sep=",")
# ### Aggregated data
#
# Preprocess it to fit the IPF algorithm
# In[ ]:
# Read the population-by-district/neighbourhood/age table and reshape it
# into the constraints table the IPF algorithm expects.
cons = pd.read_csv(
    joinpath(data_path, "numero_habitantes_distrito_barrio_edad_2020.csv"),
    sep=';')
# drop unused columns by position (keyword `axis=`: the positional axis
# argument to drop() was removed in pandas 2.0)
cons = cons.drop(cons.columns[[0, 3, 4]], axis=1)

# aggregate the neighbourhood rows up to district level
cons = cons[1:46].groupby(cons.columns[1])[cons.columns[2:]].sum()

# save the totals column, then remove it from the table
totals = cons["TOTAL BARRIO"]
cons = cons.drop("TOTAL BARRIO", axis=1)

# keep only the age columns of interest
cons = cons.drop(cons.columns[0:20], axis=1)
cons = cons.drop(cons.columns[160:224], axis=1)

# columns alternate male/female; total each gender across all ages
males = cons[cons.columns[::2]].sum(axis=1)
females = cons[cons.columns[1::2]].sum(axis=1)

# collapse each male/female column pair into one age-group column
# (transpose-groupby-transpose: DataFrame.groupby(axis=1) is deprecated)
cons = cons.T.groupby((np.arange(len(cons.columns)) // 2) + 1).sum().T

cons['f'] = females
cons['m'] = males
cons['totals'] = totals
cons.columns = cons.columns.astype(str)

# merge the first two rows into a single "ABANDO" row
# (pd.concat: DataFrame.append was removed in pandas 2.0)
sum_cons = cons.iloc[0:2].sum(axis=0).rename("ABANDO")
cons = pd.concat([cons.iloc[2:], sum_cons.to_frame().T])
cons.to_csv(joinpath(data_path, "cons.csv"), sep=',')
# ### Filter amenities
# In[ ]:
# amenity categories (values of the OpenStreetMap "amenity" tag)
sustence = ["bar", "cafe", "biergarten", "fast_food",
            "food_court", "ice_cream", "pub", "restaurant"]  # duplicate "cafe" removed
education = ["college", "driving_school", "kindergarten", "language_school",
             "library", "toy_library", "music_school", "school", "university"]
financial = ["bank"]
healthcare = ["clinic", "dentist", "doctors", "hospital",
              "nursing_home", "pharmacy", "social_facility", "veterinary"]
entertainment = ["arts_center", "casino", "cinema", "community_centre", "conference_centre",
                 "events_venue", "nightclub", "planetarium", "social_centre", "studio", "theatre"]
public_services = ["courthouse", "embassy", "fire_station", "police", "post_box",
                   "post_depot", "post_office", "prison", "ranger_station", "townhall"]
others = ["marketplace"]

# template record for an extracted amenity; originally this dict was
# assigned to `sustence`, accidentally clobbering the category list above
amenity_record = {
    # NOTE(review): "anemity" looks like a typo for "amenity" — left as-is
    # because the key may be consumed elsewhere
    "anemity": "",
    "lat": "",
    "long": ""
}

# read the osm file
# NOTE(review): handle is never closed — wrap in `with` once the
# extraction below is implemented
osm = open(joinpath(data_path, "amenities_bilbao.osm"))
# TODO: extract amenity name and location from the OSM XML
# ### Preprocess OD matrices
#
# - districts to be used: 
# In[ ]:
# Read the raw origin-destination matrix and aggregate its zones into the
# eight Bilbao districts.
odm = pd.read_csv(joinpath(data_path, "ODM_bilbao1.csv"))

# zones of the raw ODM grouped by district
DEUSTO = ['D1 NORTE', 'D1 SUR']
URIBARRI = ["CASTANOS", "D2 ALTO", "RIA ESTE",
            "RIA OESTE", "MB L3 (METRO BILBAO LINEA 3)"]
OTXARKOAGA_TXURDINAGA = ["OTXARKOAGA", "TXURDINAGA"]
BEGONA = ["D4 SUR"]
IBAIONDO = ["CASCO VIEJO", "D5 ESTE", "D5 OESTE"]
ABANDO = ['CENTRO', 'HOSPITAL', 'ACCESOS V (ACCESOS VIARIOS)', 'INTERMODAL']
ERREKALDE = ["D7 NORTE", "D7 SUR"]
BASURT_ZORROTZA = ["BASURTO", "ZORROZA"]
districts = ["ABANDO",
             "BASURT_ZORROTZA",
             "BEGONA",
             "DEUSTO",
             "ERREKALDE",
             "IBAIONDO",
             "OTXARKOAGA_TXURDINAGA",
             "URIBARRI"]

# build a zone -> district mapping and rewrite the ODM in a single pass
# (replaces eight near-identical replace loops; equivalent because zone
# names are unique and no district name occurs as a zone name)
zone_to_district = {}
for district_name, zones in [("DEUSTO", DEUSTO),
                             ("URIBARRI", URIBARRI),
                             ("OTXARKOAGA_TXURDINAGA", OTXARKOAGA_TXURDINAGA),
                             ("BEGONA", BEGONA),
                             ("IBAIONDO", IBAIONDO),
                             ("ABANDO", ABANDO),
                             ("ERREKALDE", ERREKALDE),
                             ("BASURT_ZORROTZA", BASURT_ZORROTZA)]:
    for zone in zones:
        zone_to_district[zone] = district_name
odm = odm.replace(zone_to_district)
odm.to_csv(joinpath(data_path, "ODM_bilbao_districtLevel.csv"))

# merge rows that now share the same (origin district, destination district)
odm = odm.groupby(['From location', 'To location']).sum()
odm.to_csv(joinpath(data_path, "ODM_bilbao_districtLevel_reduced.csv"))