# Source code for h0rton.tdlmc_utils.tdlmc_parser

import os
import numpy as np
import pandas as pd
from ast import literal_eval
import re
import h0rton.tdlmc_data
from h0rton.tdlmc_utils import tdlmc_metrics

# Names exported by `from h0rton.tdlmc_utils.tdlmc_parser import *`.
__all__ = [
    'convert_to_dataframe',
    'parse_closed_box',
    'parse_open_box',
    'read_from_csv',
    'format_results_for_tdlmc_metrics',
]

# Absolute form of the first entry on the `h0rton.tdlmc_data` package path.
tdlmc_data_path = os.path.abspath(
    list(h0rton.tdlmc_data.__path__)[0]
)
"""str: absolute path of the directory containing the TDLMC data

"""

def format_results_for_tdlmc_metrics(version_dir, out_dir, rung_id=2):
    """Format the BNN inference results so they can be read into the script that 
    generates the TDLMC metrics cornerplot

    For every submission label, the lens summary table is filtered, the
    test-set-wide goodness/precision/accuracy are printed, and the rows
    ``<lens name> <H0_mean> <H0_std>`` are appended (space-separated, no
    header, no index) to ``<out_dir>/H0rton/<label>.txt``.

    Parameters
    ----------
    version_dir : str or os.path object
        path to the folder containing inference results; `summary.csv` is
        read from this folder
    out_dir : str or os.path object
        path to the folder under which the `H0rton/<label>.txt` files are
        written. The `H0rton` subfolder is created if it does not exist.
    rung_id : int
        TDLMC rung ID

    Notes
    -----
    The output files are opened in append mode, so calling this function
    twice with the same `out_dir` duplicates the rows.

    """
    # NOTE(review): the first tuple element (a version ID) is never used
    # below; every label reads the same `version_dir`. Confirm whether the
    # version ID was meant to select a different results folder per label.
    label_to_id = {'A1 (0.5 HST orbit)': (4, ''), 
                   'A2 (1 HST orbit)': (3, ''), 
                   'A3 (2 HST orbits)': (2, ''), 
                   'B1 (89 doubles for 1 HST orbit)': (3, '_doubles'), 
                   'B2 (89 quads for 1 HST orbit)': (3, '_quads')}
    true_H0 = 70.0

    # The same table feeds every label, so read it once and hand each
    # iteration its own copy to filter.
    full_summary = pd.read_csv(os.path.join(version_dir, 'summary.csv'), 
                               index_col=None)

    def lens_name_formatting(seed_id):
        return 'rung{:d}_seed{:d}'.format(rung_id, seed_id)

    for label, (_version_id, img) in label_to_id.items():
        summary = full_summary.copy()
        # Discard rows whose id is above 199
        outside_rung = summary[summary['id'] > (199)].index
        summary.drop(list(outside_rung), inplace=True)
        summary['keep'] = True # keep all lenses
        summary.loc[~summary['keep'], ['H0_mean', 'H0_std']] = -99
        summary['id'] = summary['id'].astype(int)

        if img != '':
            # Use equally many doubles and quads: truncate the selected
            # subset to the size of the smaller of the two.
            summary['is_quad'] = (summary['n_img'] == 4)
            n_quads = int(summary['is_quad'].sum())
            n_test = min(len(summary) - n_quads, n_quads)
            if img == '_doubles':
                selected = ~summary['is_quad']
            else:
                selected = summary['is_quad']
            # Explicit copy so the column assignment below acts on an
            # independent frame instead of a slice of the full table.
            summary = summary[selected].iloc[:n_test].copy()

        tdlmc_mean = summary['H0_mean'][summary['keep']]
        tdlmc_std = summary['H0_std'][summary['keep']]

        # Test-set-side metrics
        G = tdlmc_metrics.get_goodness(tdlmc_mean,tdlmc_std, true_H0)
        P = tdlmc_metrics.get_precision(tdlmc_std, true_H0)
        A = tdlmc_metrics.get_accuracy(tdlmc_mean, true_H0)
        print("Goodness: ", G, "Log goodness: ", np.log10(G))
        print("Precision: ", P)
        print("Accuracy: ", A)
        print("Total combined", summary[summary['keep']].shape[0])
        print("Actually discarded", summary[~summary['keep']].shape[0])

        summary['rung_id'] = summary['id'].apply(lens_name_formatting)
        summary = summary[['rung_id', 'H0_mean', 'H0_std']]
        out_path = os.path.join(out_dir, 'H0rton/{:s}.txt'.format(label))
        # to_csv does not create missing folders
        os.makedirs(os.path.dirname(out_path), exist_ok=True)
        summary.to_csv(out_path, header=None, index=None, sep=' ', mode='a')

def read_from_csv(csv_path):
    """Read a Pandas Dataframe from the combined csv file of TDLMC data while 
    evaluating all the relevant strings in each column as Python objects

    Parameters
    ----------
    csv_path : str
        path to the csv file generated using `convert_to_dataframe`

    Returns
    -------
    Pandas DataFrame
        the TDLMC data, with the list-valued columns converted from their
        string form into NumPy arrays

    Raises
    ------
    KeyError
        if one of the list-valued columns is missing from the csv file

    """
    # Columns whose cells are stored in the csv as the text of a Python
    # list/tuple literal
    list_cols = (
        'host_pos',
        'measured_td',
        'measured_td_err',
        'agn_img_pos_x',
        'agn_img_pos_y',
        'agn_img_amp',
        'time_delays',
    )

    def to_array(cell):
        # literal_eval only accepts Python literals, so arbitrary
        # expressions in the csv are rejected rather than executed
        return np.array(literal_eval(cell))

    df = pd.read_csv(csv_path, index_col=False)
    for list_col in list_cols:
        df[list_col] = df[list_col].apply(to_array)
    return df

def convert_to_dataframe(rung, save_csv_path=None):
    """Store the TDLMC closed and open boxes into a Pandas DataFrame and exports 
    to a csv file

    Parameters
    ----------
    rung : int
        rung number. Only rungs 1 and 2 are supported.
    save_csv_path : str, optional
        path of the csv file to be generated. If None, the file is saved as
        `rung<rung>_combined.csv` inside the TDLMC data directory.

    Returns
    -------
    Pandas DataFrame
        the extracted rung data, sorted by seed

    Raises
    ------
    NotImplementedError
        if `rung` is not 1 or 2
    ValueError
        if the number of parsed lenses does not match the number of
        hand-entered image orderings for the rung

    """
    # Hand-entered image orderings (one entry per lens, in seed-sorted
    # order) and the H0 value assigned to each supported rung.
    rung_info = {
        1: ([[0, 1, 2, 3], #101
             [0, 1, 2, 3], #102
             [0, 1, 2, 3], #103
             [0, 1], #104
             [0, 1], #105
             [0, 1, 2, 3], #107
             [1, 0, 3, 2], #108
             [1, 2, 0, 3], #109
             [1, 2, 3, 0], #110
             [3, 1, 0, 2], #111
             [2, 0, 1, 3], #113
             [1, 0], #114
             [1, 3, 2, 0], #115
             [1, 0], #116
             [3, 2, 0, 1], #117
             [3, 1, 0, 2], #118
             ], 74.151),
        2: ([[0, 1, 2, 3], #119
             [0, 1, 2, 3], #120
             [0, 1, 2, 3], #121
             [0, 1, 2, 3], #122
             [0, 1, 2, 3], #123
             [0, 2, 1, 3], #124
             [0, 1], #125
             [0, 1], #126
             [3, 0, 1, 2], #127
             [3, 2, 0, 1], #128
             [3, 0, 1, 2], #129
             [2, 1, 0, 3], #130
             [3, 0, 2, 1], #131
             [1, 3, 2, 0], #132
             [1, 0], #133
             [0, 1], #134
             ], 66.643),
    }
    # Reject unsupported rungs before touching the filesystem
    if rung not in rung_info:
        raise NotImplementedError(
            "abcd ordering is only tabulated for rungs 1 and 2")
    abcd_ordering, rung_H0 = rung_info[rung]

    if save_csv_path is None:
        save_csv_path = os.path.join(tdlmc_data_path, 'rung{:d}_combined.csv'.format(rung))
        print("Saving rung {:d} data at {:s}...".format(rung, save_csv_path))

    # Collect one dict per lens and build the frame once at the end
    # (DataFrame.append no longer exists in pandas >= 2.0).
    rows = []
    for code in ['code1', 'code2']:
        closed_code_dir = os.path.join(tdlmc_data_path, 'rung{:d}'.format(rung), code)
        open_code_dir = os.path.join(tdlmc_data_path, 'rung{:d}_open_box'.format(rung), code)
        seeds = sorted(os.listdir(closed_code_dir)) # list of seeds, e.g. 'f160w-seed101'
        for seed in seeds:
            # Path to the text files
            closed_box_path = os.path.join(closed_code_dir, seed, 'lens_info_for_Good_team.txt')
            open_box_path = os.path.join(open_code_dir, seed, 'lens_all_info.txt')
            # Fresh dict per lens so that rows never share state;
            # save seed path for easy access
            row = {
                'name': 'rung{:d}_{:s}_{:s}'.format(rung, code, seed),
                'seed': seed,
                'seed_path': os.path.join(closed_code_dir, seed),
            }
            # Parse the text files
            row = parse_closed_box(closed_box_path, row)
            row = parse_open_box(open_box_path, row)
            rows.append(row)
    df = pd.DataFrame(rows)

    # Unravel nested dictionaries in some columns
    lens_mass = df['lens_mass'].apply(pd.Series).add_prefix('lens_mass_')
    lens_light = df['lens_light'].apply(pd.Series).add_prefix('lens_light_')
    ext_shear_bphi = df['ext_shear_bphi'].apply(pd.Series).add_prefix('ext_shear_')
    ext_shear_e1e2 = df['ext_shear_e1e2'].apply(pd.Series).add_prefix('ext_shear_')
    df = pd.concat([df.drop(['lens_mass', 'lens_light', 'ext_shear_bphi', 'ext_shear_e1e2'], axis=1), lens_mass, lens_light, ext_shear_bphi, ext_shear_e1e2], axis=1)

    # Manually add abcd_ordering_i
    df = df.sort_values('seed', axis=0)
    if len(abcd_ordering) != len(df):
        raise ValueError(
            "Expected {:d} lenses for rung {:d} but parsed {:d}".format(
                len(abcd_ordering), rung, len(df)))
    # The orderings have different lengths (doubles vs. quads), so they are
    # stored as one Python list per cell; np.array would reject the ragged
    # input on recent NumPy. Reusing df.index keeps the assignment
    # positional with respect to the seed-sorted rows.
    df['abcd_ordering_i'] = pd.Series(abcd_ordering, index=df.index)
    # Overrides the H0 value parsed from the open box files
    df['H0'] = rung_H0

    df.to_csv(save_csv_path, index=None)
    return df

def parse_closed_box(closed_box_path, row_dict=None):
    """Parse the lines of a closed-box TDLMC text file for Rungs 0, 1, and 2

    The values are located by fixed (0-based) line numbers: redshifts on
    line 2, velocity dispersion on line 5, and time delays on line 7.

    Parameters
    ----------
    closed_box_path : str
        path to the closed box text file, `lens_info_for_Good_team.txt`
    row_dict : dict, optional
        dictionary of the row info to update in place. If None (default), a
        new dictionary is created for this call.

    Returns
    -------
    dict
        An updated dictionary containing the information in the closed box text file

    """
    # A fresh dict per call: a `dict()` default would be created once and
    # silently shared (and mutated) by every call that omits `row_dict`.
    if row_dict is None:
        row_dict = {}

    # Context manager so the file handle is closed even if parsing fails
    with open(closed_box_path) as closed_box_file:
        lines = [line.rstrip('\n') for line in closed_box_file]

    # Tab-separated field holding a "(z_lens, z_src)" tuple literal
    row_dict['z_lens'], row_dict['z_src'] = literal_eval(lines[2].split('\t')[1])

    # "<value>km/s ... :<error>km/s" after the tab
    vel_disp_parts = lines[5].split('\t')[1].split('km/s')
    row_dict['measured_vel_disp'] = float(vel_disp_parts[0])
    row_dict['measured_vel_disp_err'] = float(vel_disp_parts[1].split(':')[1])

    # First parenthesized group: delays; second parenthesized group: errors
    td_parts = re.split(r'\(|\)', lines[7])
    row_dict['measured_td'] = literal_eval(td_parts[1])
    row_dict['measured_td_err'] = literal_eval(td_parts[3])
    return row_dict

def parse_open_box(open_box_path, row_dict=None):
    """Parse the lines of an open-box TDLMC text file for Rungs 0, 1, and 2

    The values are located by fixed (0-based) line numbers in the file.

    Parameters
    ----------
    open_box_path : str
        path to the open box text file, `lens_all_info.txt`
    row_dict : dict, optional
        dictionary of the row info to update in place. If None (default), a
        new dictionary is created for this call.

    Returns
    -------
    dict
        An updated dictionary containing the information in the open box text file

    Notes
    -----
    `host_img_mag` and `agn_img_mag` are stored as the raw substrings cut
    from the file (not converted to float).

    """
    # A fresh dict per call: a `dict()` default would be created once and
    # silently shared (and mutated) by every call that omits `row_dict`.
    if row_dict is None:
        row_dict = {}

    # Context manager so the file handle is closed even if parsing fails
    with open(open_box_path) as open_box_file:
        lines = [line.rstrip('\n') for line in open_box_file]

    paren_split = r'\(|\)'

    row_dict['H0'] = float(re.split(r':\s|km/s/Mpc', lines[3])[-2])
    row_dict['td_distance'] = float(re.split('ls:|Mpc', lines[5])[-2])
    row_dict['time_delays'] = literal_eval(re.split(paren_split, lines[7])[1])
    # Dict literal follows a 7-character prefix
    row_dict['lens_mass'] = literal_eval(lines[11][7:])
    # One parenthesized pair of dict literals: (e1/e2 form, b/phi form)
    row_dict['ext_shear_e1e2'], row_dict['ext_shear_bphi'] = literal_eval(re.split(paren_split, lines[12])[1])
    row_dict['lens_light'] = literal_eval(lines[14].split('\t')[1])

    host_parts = re.split(r'\(|\)|:|\t', lines[16])
    row_dict['host_name'] = host_parts[2][1:] # drop the leading character
    row_dict['host_pos'] = literal_eval(host_parts[-2])

    host_tokens = re.split(r'\t|\s', lines[17])
    row_dict['host_mag'] = float(host_tokens[3])
    row_dict['host_r_eff'] = float(host_tokens[7])

    row_dict['agn_src_amp'] = float(lines[20].split()[-1])
    agn_pos_parts = re.split(paren_split, lines[21])
    row_dict['agn_img_pos_x'] = literal_eval(agn_pos_parts[1])
    row_dict['agn_img_pos_y'] = literal_eval(agn_pos_parts[3])
    row_dict['agn_img_amp'] = literal_eval(re.split(paren_split, lines[22])[1])

    # No trailing `|` in the pattern: an empty alternative makes re.split
    # (Python >= 3.7) cut between every character, so the fixed indices 3
    # and 7 would no longer land on the magnitudes.
    mag_parts = re.split('plane: |mag', lines[23])
    row_dict['host_img_mag'] = mag_parts[3]
    row_dict['agn_img_mag'] = mag_parts[7]

    row_dict['vel_disp'] = float(re.split(r'km\/s| |\t', lines[25])[1])
    row_dict['kappa_ext'] = float(lines[27].split('\t')[1])

    return row_dict