Demographics data processing notebook

Packages and Constants

In [1]:
import pandas as pd
import numpy as np
import altair as alt
import os
import math

PATH = os.getcwd() + "/data/demographics.csv"

Generic Functions

In [2]:
def get_column_from_csv(file, col_name):
    # Try to get the file and if it doesn't exist issue a warning
    try:
        df = pd.read_csv(file)
    except FileNotFoundError:
        print("File Doesn't Exist")
    else:
        return df[col_name]

def to_precision(x,p):
    """
    returns a string representation of x formatted with a precision of p

    Based on the webkit javascript implementation taken from here:
    https://code.google.com/p/webkit-mirror/source/browse/JavaScriptCore/kjs/number_object.cpp
    """

    x = float(x)

    if x == 0.:
        return "0." + "0"*(p-1)

    out = []

    if x < 0:
        out.append("-")
        x = -x

    e = int(math.log10(x))
    tens = math.pow(10, e - p + 1)
    n = math.floor(x/tens)

    if n < math.pow(10, p - 1):
        e = e -1
        tens = math.pow(10, e - p+1)
        n = math.floor(x / tens)

    if abs((n + 1.) * tens - x) <= abs(n * tens -x):
        n = n + 1

    if n >= math.pow(10,p):
        n = n / 10.
        e = e + 1

    m = "%.*g" % (p, n)

    if e < -2 or e >= p:
        out.append(m[0])
        if p > 1:
            out.append(".")
            out.extend(m[1:p])
        out.append('e')
        if e > 0:
            out.append("+")
        out.append(str(e))
    elif e == (p -1):
        out.append(m)
    elif e >= 0:
        out.append(m[:e+1])
        if e+1 < len(m):
            out.append(".")
            out.extend(m[e+1:])
    else:
        out.append("0.")
        out.extend(["0"]*-(e+1))
        out.append(m)

    return "".join(out)

Data loading and cleaning

Read file, as csv was saved on excel it has some issues of encoding, also actual header is for humans.

In [3]:
df = pd.read_csv(PATH, header=1,encoding='cp1252')
df.head()
Out[3]:
WC04_01 WC04_03 WC04_02 AF01_rgs SP03_01 SP03_02 SD02_01 PD04_01 MK01 MK01_01 ... PA02_09 PA02_10 PA02_11 PA02_12 PA02_13 PA02_14 PA02_15 PA02_16 PA02_17 PA02_18
0 1 1 allik 1 6 5 46 male 5 2 ... 4 4 1 3 1 3 4 1 4 4
1 2 2 lison 1 5 5 26 Female 4 2 ... 4 3 1 2 2 2 3 1 4 4
2 1 3 dafat 1 5 5 31 Male 4 1 ... 4 2 1 1 1 2 2 1 1 1
3 2 4 brnor 1 4 3 30 MALE 8 2 ... 4 4 1 3 1 2 3 1 2 3
4 1 5 rakim 1 5 5 36 male 1 2 ... 3 3 1 3 1 2 1 1 1 2

5 rows × 65 columns

Wrangle the useful stuff

In [4]:
df = df.rename(columns = {
    'WC04_01': 'PID',
    'WC04_03': 'UID',
    'WC04_02': 'codename',
    'SD02_01': 'age',
    'PD04_01': 'gender',
    'SP03_01': 'vr_exp_a',
    'SP03_02': 'vr_exp_b',
    'MK02_01': 'music_years',
    'MK04_01': 'computer_music_years'
})
value_list = ['AF01_rgs', 'OQ01_01', 'OQ02_01','OQ03_01','OQ04_01', 'OQ05_01','MK01',
              'MK01_01','MK01_02','MK01_03','MK01_04','MK01_05','MK01_06','MK01_08','MK01_08a', 'MK01_09',
              'MK01_07','MK01_07a','MK05_01','MK06','MK01_09a']
df = df.drop(value_list, axis = 1)
# Female = 0, Male = 1
df.gender = [1,0,1,1,1,1,1,1,1,1,1,1]
df
Out[4]:
PID UID codename vr_exp_a vr_exp_b age gender music_years computer_music_years MS01_01 ... PA02_09 PA02_10 PA02_11 PA02_12 PA02_13 PA02_14 PA02_15 PA02_16 PA02_17 PA02_18
0 1 1 allik 6 5 46 1 30 20 6 ... 4 4 1 3 1 3 4 1 4 4
1 2 2 lison 5 5 26 0 10 3 6 ... 4 3 1 2 2 2 3 1 4 4
2 1 3 dafat 5 5 31 1 16 12 7 ... 4 2 1 1 1 2 2 1 1 1
3 2 4 brnor 4 3 30 1 14 7 6 ... 4 4 1 3 1 2 3 1 2 3
4 1 5 rakim 5 5 36 1 20 1 5 ... 3 3 1 3 1 2 1 1 1 2
5 2 6 gipri 2 2 31 1 20 10 5 ... 1 1 1 1 1 1 1 1 1 1
6 2 8 toion 6 6 31 1 20 5 6 ... 1 2 1 2 1 2 2 1 1 1
7 1 7 roack 4 4 31 1 17 15 7 ... 4 4 3 3 3 3 3 3 3 3
8 2 10 aneme 6 6 23 1 0 0 5 ... 1 1 1 1 1 1 1 1 1 1
9 1 9 kaofi 6 6 40 1 20 20 6 ... 1 1 1 1 1 1 2 1 1 1
10 1 11 kyzie 4 4 35 1 20 20 7 ... 5 5 1 2 5 5 5 1 5 3
11 2 12 keoch 3 3 33 1 20 20 7 ... 5 5 1 2 5 5 5 1 5 5

12 rows × 44 columns

Statistics

Get basic demographics for paper

In [5]:
print('Age, Years M(SD): '+ to_precision(df.age.mean(),3) +'('+ to_precision(df.age.std(),3) +')')
print('VR Experience A, 1-7 M(SD): '+ to_precision(df.vr_exp_a.mean(),3) +'('+ to_precision(df.vr_exp_a.std(),3) +')')
print('VR Experience B, 1-7 M(SD): '+ to_precision(df.vr_exp_b.mean(),3) +'('+ to_precision(df.vr_exp_b.std(),3) +')')
print('Music Experience, Years M(SD): '+ to_precision(df.music_years.mean(),3) +'('+ to_precision(df.music_years.std(),3) +')')
print('Computer Music Experience, Years M(SD): '+ to_precision(df.computer_music_years.mean(),3) +'('+ to_precision(df.computer_music_years.std(),3) +')')
print('Genders: M=11, F=1' )
Age, Years M(SD): 32.7(6.06)
VR Experience A, 1-7 M(SD): 4.67(1.30)
VR Experience B, 1-7 M(SD): 4.50(1.31)
Music Experience, Years M(SD): 17.3(7.20)
Computer Music Experience, Years M(SD): 11.1(7.86)
Genders: M=11, F=1

Compute MSI

In [6]:
msi_df = df.loc[:, 'UID':'MS02_05' ]
value_list = ['codename', 'vr_exp_a', 'vr_exp_b','music_years','computer_music_years']
msi_df = msi_df.drop(value_list, axis = 1)

# Recode musical training q's
# 0; 1; 2; 3; 4-5; 6-9; 10 or more
# 1  2  3  4    5    6  7
def recode_msi_instrument(years):
    years = int(years)
    if years == 0:
        return 1
    elif years == 1:
        return 2
    elif years == 2:
        return 3
    elif years == 3:
        return  4
    elif 4 <= years <= 5:
        return 5
    elif 6 <= years <= 9:
        return 6
    elif years > 9:
        return 7
    else:
        return 0
assert recode_msi_instrument(-1) == 0
assert recode_msi_instrument(0) == 1
assert recode_msi_instrument(1) == 2
assert recode_msi_instrument(2) == 3
assert recode_msi_instrument(3) == 4
assert recode_msi_instrument(5) == 5
assert recode_msi_instrument(7) == 6
assert recode_msi_instrument(11) == 7

# 0 / 0.5 / 1 / 1.5 / 2 / 3-4 / 5 or more
# 1   2     3    4    5    6    7
def recode_msi_practise(hours):
    if hours == 0:
        return 1
    elif 0 < hours <= 0.5:
        return 2
    elif 0.5 < hours <= 1:
        return 3
    elif 1 < hours <= 1.5:
        return 4
    elif 1.5 < hours <= 2:
        return 5
    elif 2 < hours <= 4:
        return 6
    elif hours > 4:
        return 7
    else:
        return 0

assert recode_msi_practise(-1) == 0
assert recode_msi_practise(0) == 1
assert recode_msi_practise(0.5) == 2
assert recode_msi_practise(1) == 3
assert recode_msi_practise(1.5) == 4
assert recode_msi_practise(2) == 5
assert recode_msi_practise(3.5) == 6
assert recode_msi_practise(5.5) == 7

# 0 / 0.5 / 1 / 2 / 3 / 4-6 / 7 or more
# 1   2     3   4   5   6    7
def recode_msi_theory(years):
    if years == 0:
        return 1
    elif 0 < years <= 0.5:
        return 2
    elif 0.5 < years <= 1:
        return 3
    elif 1 < years <= 2:
        return 4
    elif 2 < years <= 3:
        return 5
    elif 3 < years <= 6:
        return 6
    elif years > 6:
        return 7
    else:
        return 0

assert recode_msi_theory(-1) == 0
assert recode_msi_theory(0) == 1
assert recode_msi_theory(0.5) == 2
assert recode_msi_theory(1) == 3
assert recode_msi_theory(1.5) == 4
assert recode_msi_theory(2) == 4
assert recode_msi_theory(3) == 5
assert recode_msi_theory(5.5) == 6
assert recode_msi_theory(7) == 7

# 0 / 0.5 / 1 / 2 / 3-5 / 6-9 / 10 or more
# 1   2     3   4   5     6     7
def recode_msi_training(years):
    if years == 0:
        return 1
    elif 0 < years <= 0.5:
        return 2
    elif 0.5 < years <= 1:
        return 3
    elif 1 < years <= 2:
        return 4
    elif 2 < years <= 5:
        return 5
    elif 5 < years <= 9:
        return 6
    elif years > 9:
        return 7
    else:
        return 0

assert recode_msi_training(-1) == 0
assert recode_msi_training(0) == 1
assert recode_msi_training(0.5) == 2
assert recode_msi_training(1) == 3
assert recode_msi_training(1.5) == 4
assert recode_msi_training(2) == 4
assert recode_msi_training(3) == 5
assert recode_msi_training(5.5) == 6
assert recode_msi_training(11) == 7

# 0 / 1 / 2 / 3 / 4 / 5 / 6 or more
# 1   2   3   4   5   6   7
def recode_msi_play(number):
    number = int(number)
    if number >= 6:
        return 7
    elif number <= -1:
        return 0
    else:
        return number + 1
assert recode_msi_play(-1) == 0
assert recode_msi_play(0) == 1
assert recode_msi_play(1) == 2
assert recode_msi_play(2) == 3
assert recode_msi_play(3) == 4
assert recode_msi_play(5) == 6
assert recode_msi_play(7) == 7
assert recode_msi_play(11) == 7

msi_df.MS02_01 = msi_df.MS02_01.apply(recode_msi_instrument)
msi_df.MS02_02 = msi_df.MS02_02.apply(recode_msi_practise)
msi_df.MS02_03 = msi_df.MS02_03.apply(recode_msi_theory)
msi_df.MS02_04 = msi_df.MS02_04.apply(recode_msi_training)
msi_df.MS02_05 = msi_df.MS02_05.apply(recode_msi_play)

# Sum items in factors and normalise to 0-1 value
# Perceptual acuity
msi_df['MsiPa'] = (msi_df.MS01_01 + msi_df.MS01_02 + ((msi_df.MS01_03*-1)+8) + msi_df.MS01_04 + ((msi_df.MS01_05*-1)+8)  + msi_df.MS01_07 + msi_df.MS01_08 + ((msi_df.MS01_09*-1)+8) + msi_df.MS01_10 )/63

#  Musical training
msi_df['MsiMt'] = ( ( (msi_df.MS01_06*-1)+8) + ((msi_df.MS01_11*-1)+8) + msi_df.MS02_01 + msi_df.MS02_02 + msi_df.MS02_03 + msi_df.MS02_04 + msi_df.MS02_05)/49

msi_df = msi_df.loc[:, ['UID','age','gender','MsiPa','MsiMt']]
msi_df
Out[6]:
UID age gender MsiPa MsiMt
0 1 46 1 0.857143 0.897959
1 2 26 0 0.793651 0.857143
2 3 31 1 0.841270 0.836735
3 4 30 1 0.873016 0.918367
4 5 36 1 0.650794 0.673469
5 6 31 1 0.825397 0.877551
6 8 31 1 0.809524 0.897959
7 7 31 1 1.000000 0.795918
8 10 23 1 0.666667 0.183673
9 9 40 1 0.841270 0.693878
10 11 35 1 1.000000 0.836735
11 12 33 1 0.920635 0.775510
In [7]:
base = alt.Chart(msi_df)

points = base.mark_point().encode(
    alt.X('MsiPa:Q', title="MSI Perceptual Acuity"),
    alt.Y('MsiMt:Q', title="MSI Musical Training"),
    color='UID:N',
)

points
Out[7]:
In [8]:
pam_df = df.loc[:, 'UID':'gender' ]
scores = df.loc[:, 'PA02_01':'PA02_18']
# for positive items, recode strongly disagree to 0, disagree to 1, neutral to 2, agree to 3, and strongly agree to 4.
scores.iloc[:,[0,1,2,3,5,6,8,9,10,12,13,14,15,16,17]] = scores.iloc[:,[0,1,2,3,5,6,8,9,10,12,13,14,15,16,17]] - 1.0
# for negative items, recode strongly disagree to 4, disagree to 3, neutral to 2, agree to 1, and strongly agree to 0.
scores.iloc[:,[4,7,11]] = scores.iloc[:,[4,7,11]] * -1 + 5
pam_df['pam'] = scores.iloc[:,0:17].sum(axis=1) / 72

value_list = ['codename', 'vr_exp_a', 'vr_exp_b']
pam_df = pam_df.drop(value_list, axis = 1)
pam_df['SID'] = [1,1,2,2,3,3,4,4,5,5,6,6]
pam_df
Out[8]:
UID age gender pam SID
0 1 46 1 0.402778 1
1 2 26 0 0.430556 1
2 3 31 1 0.291667 2
3 4 30 1 0.291667 2
4 5 36 1 0.263889 3
5 6 31 1 0.166667 3
6 8 31 1 0.263889 4
7 7 31 1 0.500000 4
8 10 23 1 0.166667 5
9 9 40 1 0.222222 5
10 11 35 1 0.736111 6
11 12 33 1 0.763889 6
In [9]:
base = alt.Chart(pam_df)

points = base.mark_bar().encode(
    alt.X('UID:N', title="User ID"),
    alt.Y('pam:Q', title="Personal Acquaintance"),
    color='SID:N'
)

points
Out[9]:
In [ ]: