Source code for ukbiobank.filtering.illness

# -*- coding: utf-8 -*-
"""
Created on Wed Mar 25 13:33:29 2020

@author: jnecus

UKBiobank data filtering utilities
"""
import re

import pandas as pd

import ukbiobank


[docs]def healthy_unhealthy_split( ukbio=None, df=None, instances=[0, 1, 2, 3], return_filter_fields=False ): """ Splits dataframe into healthy_df and unhealthy_df based upon exclusion criteria used in Cole 2020 https://www.sciencedirect.com/science/article/pii/S0197458020301056 Exclusion critera were: An ICD-10 diagnosis (#41270), Self-reported long-standing illness disability or infirmity (UK Biobank data field #2188), Self-reported diabetes (field #2443) Stroke history (field #4056), Not having good or excellent self-reported health (field #2178). *Note: According to these criteria, around ~20% of the data are 'healthy', with 80% deemed 'unhealthy'* Parameters ---------- ukbio : ukbio object, mandatory df : pandas dataframe (generated using ukbio loadCsv). Mandatory instances : list of integers, Default is [0,1,2,3] (include all instances) return_filter_fields : Boolean, default False. If True, the fields used to filter according to healthy/unhealthy criteria are included in return dataframes (this can be useful for validation and see investigate the cause of healthy/unhealthy classification). If False, returned dataframes contain the same fields as the input dataframe. Returns ------- healthy_df, dataframe with individuals not matching exclusion criteria : Pandas dataframe unhealthy_df, dataframe with individuals containing one or more matching exclusion criteria : Pandas dataframe """ # Make copy of input dataframe in_df = df.copy() # Add fields required for exclusion criteria fields = [2188, 2443, 4056, 2178] df = ukbiobank.utils.addFields( ukbio=ukbio, df=df, fields=fields, instances=instances ) # ICD-10 diagnosis information is tied to instance '0', & added seperately fields = [41270] df = ukbiobank.utils.addFields(ukbio=ukbio, df=df, fields=fields, instances=0) # Gathering exclusion criteria columns and appending 'eid' excl_cols = list(df.columns.difference(in_df.columns)) excl_cols.append("eid") df_excl_criteria = df[excl_cols] unhealthy_eids = [] # List to collect eids of individuals identified as 'unhealthy' # Filtering those with ICD-10 diagnosis #41270 (collecting eid for individuals with not any 'non-nan', i.e. containing a diagnosis) col = list(df_excl_criteria.columns[df_excl_criteria.columns.str.match("^41270-")]) uh_eids = df_excl_criteria.loc[(df_excl_criteria[col].notna().any(axis=1)), "eid"] unhealthy_eids.extend(list(uh_eids)) # extending list of unhealthy eid's # Filtering those with self reported illness/disability #2188 responses_to_match = [ 1, -1, -3, ] # Will collecting eids with response '1' = 'yes', '-1' (do not know) and '-3' (prefer not to answer) col = list(df_excl_criteria.columns[df_excl_criteria.columns.str.match("^2188-")]) uh_eids = df_excl_criteria.loc[ (df_excl_criteria[col].isin(responses_to_match).all(axis=1)), "eid" ] unhealthy_eids.extend(list(uh_eids)) # Filtering those with self reported diabetes #2443 responses_to_match = [ 1, -1, -3, ] # Will collecting eids with response '1' = 'yes' to this field, '-1' (do not know) and '-3' (prefer not to answer) col = list(df_excl_criteria.columns[df_excl_criteria.columns.str.match("^2443-")]) uh_eids = df_excl_criteria.loc[ (df_excl_criteria[col].isin(responses_to_match).all(axis=1)), "eid" ] unhealthy_eids.extend(list(uh_eids)) # Filtering those with age stroke diagnosed #4056 (collecting eid for individuals with not any 'non-nan', i.e. containing an age of stroke diagnosis or a -1/-3 (do not know/prefer not to answer)) col = list(df_excl_criteria.columns[df_excl_criteria.columns.str.match("^4056-")]) uh_eids = df_excl_criteria.loc[(df_excl_criteria[col].notna().any(axis=1)), "eid"] unhealthy_eids.extend(list(uh_eids)) # extending list of unhealthy eid's # Filtering those not responding as having 'good' or 'excellent' health #2178 responses_to_match = [ -3, -1, 3, 4, ] # Collecting eids with response '-3,-1, 3, 4' = 'Do not know, no answer, fair, poor' (all but good or excellent) to this field col = list(df_excl_criteria.columns[df_excl_criteria.columns.str.match("^2178-")]) uh_eids = df_excl_criteria.loc[ (df_excl_criteria[col].isin(responses_to_match).all(axis=1)), "eid" ] unhealthy_eids.extend(list(uh_eids)) # Splitting original df by 'healthy' vs. 'unhealthy' eids unhealthy_eids = list(set(unhealthy_eids)) # unique unhealthy eids df_unhealthy = in_df.loc[in_df["eid"].isin(unhealthy_eids)] df_healthy = in_df.loc[~in_df["eid"].isin(unhealthy_eids)] # Including criteria fields if requested if return_filter_fields == True: df_unhealthy = pd.merge(df_unhealthy, df_excl_criteria, on="eid") df_healthy = pd.merge(df_healthy, df_excl_criteria, on="eid") return df_healthy, df_unhealthy