# -*- coding: utf-8 -*-
"""
Created on Wed Mar 25 13:33:29 2020
@author: jnecus
UKBiobank data filtering utilities
"""
import ukbiobank
# import ukbiobank.utils
# from ukbiobank.utils import fieldNamesToIds, addFields
def _as_list(value):
    """Return ``value`` as a list; scalars are wrapped, ``None`` passes through."""
    if value is None or isinstance(value, list):
        return value
    # Strings are iterable but are meant as a single value here.
    if isinstance(value, (str, bytes)) or not hasattr(value, "__iter__"):
        return [value]
    return list(value)


def filterInstancesArrays(ukbio=None, df=None, instances=None, arrays=None):
    """
    Keep only the columns of ``df`` that belong to the selected instances/arrays.

    Parameters
    ----------
    ukbio : ukbio object, mandatory
        Must expose ``field_instance_array_df``, a dataframe with the columns
        ``id_instance_array``, ``instance`` and ``array``.
    df : pandas dataframe (generated using ukbio loadCsv), mandatory
        Must contain an ``eid`` column.
    instances : Integer or list of integers. Default is none (include all instances)
    arrays : Integer or list of integers. Default is none (include all arrays)

    Returns
    -------
    Dataframe with datafields filtered for selected instances/arrays : Pandas dataframe
        Selected columns keep the order they have in ``df``; ``eid`` is
        appended last when it is not already among the selected columns.

    Raises
    ------
    KeyError
        If ``df`` has no ``eid`` column.
    """
    # Accept a bare scalar (or any iterable) for both selectors, not just for
    # `instances`: Series.isin() rejects scalars with a TypeError.
    instances = _as_list(instances)
    arrays = _as_list(arrays)

    # If every header contains a lowercase letter the headers are taken to be
    # field names rather than IDs, and are converted through the project helper.
    if df.columns.str.contains("[a-z]").all():
        df = ukbiobank.utils.fieldNamesToIds(ukbio, df)

    # Boolean indexing returns new frames, so the shared catalogue on `ukbio`
    # is never modified and no defensive copy is needed.
    catalogue = ukbio.field_instance_array_df
    if instances is not None:
        catalogue = catalogue[catalogue["instance"].isin(instances)]
    if arrays is not None:
        catalogue = catalogue[catalogue["array"].isin(arrays)]

    # Intersect with the dataframe's own headers while walking `df.columns`,
    # so the result has a deterministic column order (a plain set
    # intersection would return the columns in arbitrary order).
    allowed = set(catalogue["id_instance_array"].tolist())
    cols = [col for col in df.columns if col in allowed]
    if "eid" not in cols:
        cols.append("eid")
    return df[cols]
def filterByField(
    ukbio=None, df=None, fields_to_include=None, instances=[0, 1, 2, 3], arrays=None
):
    """
    Keep the rows of ``df`` in which any requested field holds a requested value.

    Parameters
    ----------
    ukbio : ukbio object, mandatory
    df : pandas dataframe (currently only accepts FieldID headers as column headers)
        If None, a dataframe is built from the requested fields with
        ``ukbiobank.utils.addFields``. Otherwise its headers are passed through
        ``ukbiobank.utils.fieldNamesToIds`` and any requested field that has no
        column is added with ``ukbiobank.utils.addFields``.
    fields_to_include: Dictionary whereby keys: 'fields to include', values:'values to include'
        A value may be a single code or a list of codes.
        *FIELDS IN FIELDS_TO_INCLUDE MUST BE IN FIELD_ID FORM* e.g. '20002' (not 'Self-reported Illness') *
        *VALUES IN FIELDS_TO_INCLUDE MUST BE IN CODED FORM* e.g. '1074', (not 'angina') *
        Values are compared with pandas ``isin``, so they need to have the same
        type as the data stored in ``df``.
    instances : list of integers, Default is [0,1,2,3] (include all instances)
        The default list is only read, never modified.
    arrays : list of integers
        NOTE(review): accepted for interface compatibility but not applied;
        every array column of a selected field/instance is searched.

    Returns
    -------
    Pandas dataframe containing every column of ``df`` (after any added fields),
    restricted to rows whose ``eid`` matched.
    *This function uses 'OR' logic, i.e. if any of the values/fields included are present then they will be included*
    """
    # Account for df = None, or if fields are not found in df, then add them
    if df is None:
        df = ukbiobank.utils.addFields(
            ukbio=ukbio, fields=list(fields_to_include.keys())
        )
    else:
        # Convert df headings to fieldid-instance.array
        df = ukbiobank.utils.fieldNamesToIds(ukbio=ukbio, df=df)

        # The field ID is the part of a header before the first "-". Compare
        # as strings so integer keys are not always reported as missing.
        present_fields = {str(col).split("-")[0] for col in df.columns}
        fields_to_add = [
            field for field in fields_to_include if str(field) not in present_fields
        ]
        if fields_to_add:
            df = ukbiobank.utils.addFields(ukbio=ukbio, df=df, fields=fields_to_add)

    # TODO (account for text/ids/mixture etc...)
    # convert keys/values from text --> id

    # Below here the expected format is e.g.
    #   'eid'      '20002-1.0'
    #   1437784    12633
    matched_eids = set()  # eids for which a match is found
    for field, value in fields_to_include.items():
        wanted = value if isinstance(value, list) else [value]
        for instance in instances:
            # The trailing "." closes the instance number, so instance 1 does
            # not also pick up columns of instance 10, 11, ...
            prefix = str(field) + "-" + str(instance) + "."
            value_cols = [col for col in df.columns if str(col).startswith(prefix)]
            if not value_cols:
                continue
            # Only the field's own columns are searched. 'eid' is deliberately
            # left out so a participant whose eid happens to equal a coded
            # value is not reported as a match.
            row_hits = df[value_cols].isin(wanted).any(axis=1)
            matched_eids.update(df.loc[row_hits, "eid"].tolist())

    return df[df["eid"].isin(list(matched_eids))]