Skip to content
Snippets Groups Projects
Select Git revision
  • master
1 result

bilbao_preprocess.py

Blame
  • bilbao_preprocess.py 7.21 KiB
    #!/usr/bin/env python
    # coding: utf-8
    
    # ### EUROSTAT data - sample data
    #
    # Description of EUROSTAT variables values:
    #
    # 1. (RB090)gender: 1- male, 2-female
    # 2. (RB080)age
    # 3. (PD060) Regularly participate in a leisure activity:
    #                                                         1 Yes
    #                                                         2 No - cannot afford it
    #                                                         3 No - other reason
    # 4. (PL031) Self-defined current economic status:
    #                                                 1 Employee working full-time
    #                                                 2 Employee working part-time
    #                                                 3 Self-employed working full-time (including family worker)
    #                                                 4 Self-employed working part-time (including family worker)
    #                                                 5 Unemployed
    #                                                 6 Pupil, student, further training, unpaid work experience
    #                                                 7 In retirement or in early retirement or has given up business
    #                                                 8 Permanently disabled or/and unfit to work
    #                                                 9 In compulsory military or community service
    #                                                 10 Fulfilling domestic tasks and care responsibilities
    #                                                 11 Other inactive person
    # 5. (PE040) Highest ISCED level attained:
    #                                                 0 pre-primary education
    #                                                 1 primary education
    #                                                 2 lower secondary education
    #                                                 3 (upper) secondary education
    #                                                 4 post-secondary non tertiary education
    #                                                 5 first stage of tertiary education (not leading directly to an
    #                                                 advanced research qualification)
    #                                                 6 second stage of tertiary
    
    # In[ ]:
    
    
    import pandas as pd
    import numpy as np
    from os.path import join as joinpath
    
    data_path = joinpath("data", "bilbao", "before_simulation")
    sample = joinpath(data_path)
    household = "ES_2013h_EUSILC.csv"
    personal = "ES_2013p_EUSILC.csv"
    household_register = "ES_2013d_EUSILC.csv"
    personal_register = "ES_2013r_EUSILC.csv"
    
    # variables we need from files
    d_var = ["DB030", "DB040", "DB090"]  # hhid, federal state, hh weight
    # person's id, REGULARLY PARTICIPATE IN A LEISURE ACTIVITY, SELF-DEFINED CURRENT ECONOMIC STATUS,HIGHEST ISCED LEVEL ATTAINED
    p_var = ["PB030", "PD060", "PL031", "PE040"]
    # person's id, person's gender, weights, year of birth
    r_var = ["RB030", "RB090", "RB050", "RB080"]
    
    # read data and merge
    h = pd.read_csv(joinpath(sample, household))
    p = pd.read_csv(joinpath(sample, personal))[p_var]
    d = pd.read_csv(joinpath(sample, household_register))[d_var]
    r = pd.read_csv(joinpath(sample, personal_register))[r_var]
    
    r["hhid"] = (np.floor(r["RB030"]/10)).astype(int)
    r["RB080"] = 2021-r["RB080"]
    r = r.merge(d, left_on="hhid", right_on="DB030")
    
    
    
    #r = r.merge(p,left_on = "RB030", right_on="PB030")
    ind = r[["RB090", "RB080"]]
    ind = ind.rename(columns={"RB090": "gender", "RB080": "age"})
    ind["gender"] = ind["gender"].replace({2: "f", 1: "m"})
    ind.to_csv(joinpath(data_path, "ind.csv"), sep=",")
    
    
    # ### Aggregated data
    #
    # Preprocess it to fit the IPF algorithm
    
    # In[ ]:
    
    
    cons = pd.read_csv(joinpath(data_path, "numero_habitantes_distrito_barrio_edad_2020.csv"), sep=';')
    cons = cons.drop(cons.columns[[0,3,4]],1)
    
    # save totals column
    
    cons = cons[1:46].groupby(cons.columns[1])[cons.columns[2:]].sum()
    totals = cons["TOTAL BARRIO"]
    
    
    cons = cons.drop("TOTAL BARRIO", axis=1)
    cons = cons.drop(cons.columns[0:20], axis=1)
    cons = cons.drop(cons.columns[160:224], axis=1)
    
    
    males = cons[cons.columns[::2]]
    females = cons[cons.columns[1::2]]
    females = females.sum(axis=1)
    males = males.sum(axis=1)
    cons = cons.groupby((np.arange(len(cons.columns)) // 2) + 1, axis=1).sum()
    cons['f'] = females
    cons['m'] = males
    cons['totals'] = totals
    cons.columns = cons.columns.astype(str)
    # save
    sum_cons = cons.iloc[0:2, ].sum(axis=0)
    sum_cons = sum_cons.rename("ABANDO", axis=1)
    new_cons = cons.iloc[2:, ]
    new_cons = new_cons.append(sum_cons)
    cons = new_cons
    cons.to_csv(joinpath(data_path, "cons.csv"), sep=',')
    
    
    # ### Filter amenities
    
    # In[ ]:
    
    
    # amenities:
    sustence = ["bar", "cafe", "biergarten", "cafe", "fast_food",
                "food_court", "ice_cream", "pub", "restaurant"]
    education = ["college", "driving_school", "kindergarten", "language_school",
                 "library", "toy_library", "music_school", "school", "university"]
    financial = ["bank"]
    healthcare = ["clinic", "dentist", "doctors", "hospital",
                  "nursing_home", "pharmacy", "social_facility", "veterinary"]
    entertainment = ["arts_center", "casino", "cinema", "community_centre", "conference_centre",
                     "events_venue", "nightclub", "planetarium", "social_centre", "studio", "theatre"]
    public_services = ["courthouse", "embassy", "fire_station", "police", "post_box",
                       "post_depot", "post_office", "prison", "ranger_station", "townhall"]
    others = ["marketplace"]
    
    sustence = {
        "anemity": "",
        "lat": "",
        "long": ""
    }
    
    # read the osm file
    osm = open(joinpath(data_path, "amenities_bilbao.osm"))
    # TODO: extract location from
    
    
    # ### Preprocess OD matrices
    #
    # - districts to be used: ![image.png](attachment:image.png)
    
    # In[ ]:
    
    
    odm = pd.read_csv(joinpath(data_path, "ODM_bilbao1.csv"))
    odm['From location'].unique()
    odm['To location'].unique()
    
    # districts
    DEUSTO = ['D1 NORTE', 'D1 SUR']
    URIBARRI = ["CASTANOS", "D2 ALTO", "RIA ESTE",
                "RIA OESTE", "MB L3 (METRO BILBAO LINEA 3)"]
    OTXARKOAGA_TXURDINAGA = ["OTXARKOAGA", "TXURDINAGA"]
    BEGONA = ["D4 SUR"]
    IBAIONDO = ["CASCO VIEJO", "D5 ESTE", "D5 OESTE"]
    ABANDO = ['CENTRO', 'HOSPITAL', 'ACCESOS V (ACCESOS VIARIOS)', 'INTERMODAL']
    ERREKALDE = ["D7 NORTE", "D7 SUR"]
    BASURT_ZORROTZA = ["BASURTO", "ZORROZA"]
    
    districts = ["ABANDO",
                 "BASURT_ZORROTZA",
                 "BEGONA",
                 "DEUSTO",
                 "ERREKALDE",
                 "IBAIONDO",
                 "OTXARKOAGA_TXURDINAGA",
                 "URIBARRI"]
    
    for i in DEUSTO:
        odm = odm.replace(to_replace=str(i), value="DEUSTO")
    for i in URIBARRI:
        odm = odm.replace(to_replace=str(i), value="URIBARRI")
    for i in OTXARKOAGA_TXURDINAGA:
        odm = odm.replace(to_replace=str(i), value="OTXARKOAGA_TXURDINAGA")
    for i in BEGONA:
        odm = odm.replace(to_replace=str(i), value="BEGONA")
    for i in IBAIONDO:
        odm = odm.replace(to_replace=str(i), value="IBAIONDO")
    for i in ABANDO:
        odm = odm.replace(to_replace=str(i), value="ABANDO")
    for i in ERREKALDE:
        odm = odm.replace(to_replace=str(i), value="ERREKALDE")
    for i in BASURT_ZORROTZA:
        odm = odm.replace(to_replace=str(i), value="BASURT_ZORROTZA")
    
    odm.to_csv(joinpath(data_path, "ODM_bilbao_districtLevel.csv"))
    
    # Merge rows by column name to reduce num of rows
    odm = odm.groupby(['From location', 'To location']).sum()
    odm.to_csv(joinpath(data_path, "ODM_bilbao_districtLevel_reduced.csv"))