from pprint import pprint
import csv
import gzip
import pandas as pd

if __name__=="__main__":
    # From http://ec.europa.eu/eurostat/statistics-explained/index.php/Tutorial:Country_codes_and_protocol_order
    eu = ["BE", "BG", "CZ", "DK", "DE", "EE", "IE", "EL", "ES", "FR", "HR",
          "IT", "CY", "LV", "LT", "LU", "HU", "MT", "NL", "AT", "PL", "PT",
          "RO", "SI", "SK", "FI", "SE", "UK"]

    data = {country: {} for country in eu}

    le = csv.reader(gzip.open("demo_mlexpec.tsv.gz", "rt"), delimiter="\t")
    for row in le:
        unit, sex, age, geo = row[0].split(",")
        if sex == "T" and age == "Y1" and geo in eu:
            data[geo]["le"] = \
                "".join([c for c in row[3] if c.isdigit() or c == "."])

    # hle = csv.reader(gzip.open("hlth_hlye.tsv.gz", "rt"), delimiter="\t")
    # for row in hle:
    #     code, geo = row[0].split(",")
    #     #age = code.split("_")[1].strip()
    #     if code == "F_0_DFLE" and geo in eu:
    #         data[geo]["hle_f"] = \
    #             "".join([c for c in row[3] if c.isdigit() or c == "."])
    #     if code == "M_0_DFLE" and geo in eu:
    #         data[geo]["hle_m"] = \
    #             "".join([c for c in row[3] if c.isdigit() or c == "."])


    bme = csv.reader(gzip.open("hlth_ehis_bm1e.tsv.gz", "rt"), delimiter="\t")
    bme_header = next(bme)
    for row in bme:
        unit,bmi,isced11,sex,age,time = row[0].split(",")
        if bmi.strip() == "BMI_GE30" and time.strip() == "2014"\
           and sex.strip() == "T" and age.strip() == "TOTAL":
            #print(bmi, time)
            for i, field in enumerate(row):
                geo = bme_header[i].strip()
                if geo in eu:
                    data[geo]["obese"] = \
                        "".join([c for c in field if c.isdigit() or c == "."])

    sk = csv.reader(gzip.open("hlth_ehis_sk1e.tsv.gz", "rt"), delimiter="\t")
    sk_header = next(sk)
    for row in sk:
        unit,smoking,isced11,sex,age,time = row[0].split(",")
        if smoking.strip() == "SM_CUR" and time.strip() == "2014"\
           and sex.strip() == "T" and age.strip() == "Y_GE18":
            #print(bmi, time)
            for i, field in enumerate(row):
                geo = sk_header[i].strip()
                if geo in eu:
                    data[geo]["sk"] = \
                        "".join([c for c in field if c.isdigit() or c == "."])

    fv = csv.reader(gzip.open("hlth_ehis_fv3e.tsv.gz", "rt"), delimiter="\t")
    fv_header = next(fv)
    for row in fv:
        unit,n_portion,isced11,sex,age,time = row[0].split(",")
        if n_portion.strip() == "GE5" and time.strip() == "2014"\
           and sex.strip() == "T" and age.strip() == "TOTAL":
            #print(bmi, time)
            for i, field in enumerate(row):
                geo = fv_header[i].strip()
                if geo in eu:
                    data[geo]["fv"] = \
                        "".join([c for c in field if c.isdigit() or c == "."])


    ex = csv.reader(gzip.open("hlth_ehis_pe2e.tsv.gz", "rt"), delimiter="\t")
    ex_header = next(ex)
    for row in ex:
        unit,duration,isced11,sex,age,time = row[0].split(",")
        if duration.strip() == "MN_GE150" and time.strip() == "2014"\
           and sex.strip() == "T" and age.strip() == "TOTAL":
            #print(bmi, time)
            for i, field in enumerate(row):
                geo = ex_header[i].strip()
                if geo in eu:
                    data[geo]["ex"] = \
                        "".join([c for c in field if c.isdigit() or c == "."])

    # US life expectancy published from birth -- subtracted 1 so it's comparable
    # to EU 1YR expectancy.
    data["US"] = {"sk": 15.5, "obese": 37.7, "le": 77.7, "fv": 10, "ex": 51.7}
    df = pd.DataFrame(data).transpose()
    df.index.name = "country"
    df.to_csv("euro-health.csv")