Teleportation and Location Data Analysis Notebook

Wrangling

Convert JSON data into a dataframe per session and compute IPTD (inter-personal teleport distance) in the process

In [222]:
from datetime import datetime
import json
from pandas.io.json import json_normalize
import pandas as pd
import numpy as np
from beakerx import *
import time

def fix_vector(x):
    """Convert a raw position value from the JSON log into a float ndarray.

    Parameters
    ----------
    x : dict | list | other
        Position as parsed from JSON: either a mapping of component name to
        value, or a plain list of components.

    Returns
    -------
    np.ndarray or None
        Float array of the components, or None for any other input type
        (e.g. NaN placeholders), so downstream code can detect missing data.
    """
    if type(x) == dict:
        # Dict insertion order is preserved (Python 3.7+), so component
        # order matches the JSON.
        return np.array(list(x.values()), dtype=float)
    elif type(x) == list:
        # Bug fix: lists have no .values(); convert the list directly.
        return np.array(x, dtype=float)
    # Any other type (e.g. float NaN) falls through and returns None.

def parse_teleport_df(user_id, verbose=False):
    """Load the paired JSON logs for one session and build one dataframe.

    Reads 'U/<user_id>.json' and 'U/<user_id+1>.json', keeps PlayerConnect
    and Teleport events, back-fills start positions, then merges both users'
    events into a single time-sorted dataframe with:
      - iptd: inter-personal teleport distance after each event
      - dt:   seconds elapsed since the session's first event

    Parameters
    ----------
    user_id : int
        Id of the first user of the pair; the partner is user_id + 1.
    verbose : bool, optional
        If True, print which users/files are being processed.

    Returns
    -------
    pd.DataFrame
    """
    p1 = str(user_id)
    p2 = str(user_id+1)
    if verbose:
        print("Getting data for " + p1 + " and " + p2)
    tf = ['U/X.json','U/X.json']
    tf[0] = tf[0].replace("X",p1)
    tf[1] = tf[1].replace("X",p2)
    if verbose:
        print("From " + tf[0] + " and " + tf[1])
    frames = []  # per-user frames; concatenated once at the end
    for file in tf:
        with open(file, 'r') as f:
             data = json.load(f)

        # pd.json_normalize replaces the removed pandas.io.json.json_normalize.
        df = pd.json_normalize(data, record_path=['Events'])

        # Fix position data, convert from dict to numpy array
        df['endPosition'] = df['endPosition'].fillna(value=np.nan)
        df['startPosition'] = df['startPosition'].apply(fix_vector)
        df['endPosition'] = df['endPosition'].apply(fix_vector)

        # 2022: json_normalize flattening leaves both 'startPosition' and
        # 'startPosition.value' columns — drop the mangled ones and rename
        # the '.value' columns back to the plain names.
        df = df.drop(columns=['startPosition', 'endPosition'])
        df = df.rename(columns={"startPosition.value": "startPosition",
                                "endPosition.value": "endPosition"})

        # Fix time: the logged string uses 'T', ':', '.', '-' separators;
        # replace each with a space so it matches '%Y %m %d %H %M %S %f'.
        for sep in ("T", ":", ".", "-"):
            df['dateTime'] = df['dateTime'].apply(lambda x: x.replace(sep, " "))
        # Remove the last millisecond unit (one digit beyond microseconds).
        df['dateTime'] = df['dateTime'].apply(lambda x: x[0:-1])
        df['dateTime'] = df['dateTime'].apply(
            lambda x: np.datetime64(datetime.strptime(x, '%Y %m %d %H %M %S %f')))

        # Remove redundant cols
        df = df.drop(columns=['avgVelocity', 'duration', 'finalValue', 'hand',
                              'id', 'initialValue', 'owner'])

        # Keep only connect and teleport events.
        df = df[df.logType.isin(['PlayerConnect', 'Teleport'])]
        # NOTE(review): the JSON apparently carries its own 'index' field, so
        # reset_index emits the positional index as 'level_0' — verify against
        # the source logs if this ever raises a KeyError.
        df = df.reset_index()
        df = df.drop(columns='level_0')

        # Derive uid from the filename ('U/7.json' -> '7', 'U/11.json' -> '11').
        if int(float(file[2:4])) < 10:
            df['uid'] = file[2]
        else:
            df['uid'] = file[2:4]

        # Back-fill startPosition: row 1 starts at the origin, every later
        # row starts where the previous teleport ended.
        df.at[1,'startPosition'] = np.array([0.0,0.0,0.0],dtype=float)
        for i in range( 2,len(df.index) ):
            df.at[i,'startPosition'] = df.endPosition.iloc[i-1]

        frames.append(df)

    # pd.concat replaces the removed DataFrame.append.
    sdf = pd.concat(frames, ignore_index=True)

    sdf = sdf.sort_values('dateTime')
    sdf = sdf.reset_index()
    sdf = sdf.drop(columns='level_0')
    # 'index' is the per-user event counter from the JSON logs.
    sdf = sdf.rename(columns={"index": "actionId"})

    # inter-personal teleport distance
    sdf['iptd'] = 0.0

    # Last known position of each player, keyed by uid string.
    lup = {}
    lup[p1] = np.array([0.0,0.0,0.0],dtype=float)
    lup[p2] = np.array([0.0,0.0,0.0],dtype=float)

    for i in range( 2, len(sdf.actionId) ):
        # Update last position of the player who teleported.
        uid = sdf.at[i,'uid']
        # 2022, numpy array conversion required.
        lup[uid] = np.array(sdf.at[i,'endPosition'],dtype=float)
        # Distance between the two players' latest known positions.
        sdf.at[i,'iptd'] = np.linalg.norm(lup[p1]-lup[p2])

    # dt: whole seconds elapsed since the first event of the session
    # (column initialised as int, so fractional seconds are truncated).
    sdf['dt'] = 0
    for i in range(1,len(sdf.actionId)):
        sdf.at[i,'dt'] = sdf.at[i,'dateTime'].timestamp() - sdf.at[0,'dateTime'].timestamp()

    return sdf

# Build one combined dataframe per dyad session (user pairs 1&2, 3&4, ..., 11&12).
sdfs = [parse_teleport_df(first_uid) for first_uid in range(1, 12, 2)]

Split position vectors into columns in each dataframe

In [223]:
def split_vector(vec):
    """Wrap a position vector in a one-chunk list via np.array_split.

    A float input (a NaN placeholder) is substituted with the origin
    vector [0, 0, 0] before splitting.
    """
    target = np.array([0, 0, 0], dtype=float) if isinstance(vec, float) else vec
    return np.array_split(target, 1)

def apply_vector(vec):
    """Coerce a position value to a float ndarray.

    Floats (NaN placeholders for missing positions) become the origin
    vector [0, 0, 0]; anything else is converted element-wise to float.
    """
    if not isinstance(vec, float):
        return np.array(vec, dtype=float)
    return np.array([0, 0, 0], dtype=float)

def split_position_vectors(i):
    """Expand the start/end position vectors of sdfs[i] into scalar columns.

    Adds sp_x/sp_y/sp_z and ep_x/ep_y/ep_z columns (in that order) and
    stores the widened frame back into sdfs[i].
    Column-splitting approach from:
    https://datascienceparichay.com/article/split-pandas-column-of-lists-into-multiple-columns/
    """
    frame = sdfs[i]
    frame['startPosition'] = frame['startPosition'].apply(apply_vector)
    frame['endPosition'] = frame['endPosition'].apply(apply_vector)
    # For each vector column, build a small frame of its components and
    # concatenate it onto the right-hand side.
    for source_col, component_cols in (('startPosition', ['sp_x', 'sp_y', 'sp_z']),
                                       ('endPosition', ['ep_x', 'ep_y', 'ep_z'])):
        components = pd.DataFrame(frame[source_col].tolist(), columns=component_cols)
        frame = pd.concat([frame, components], axis=1)
    sdfs[i] = frame

def add_sid(i):
    """Tag every row of sdfs[i] with its 1-based session id."""
    session_number = i + 1
    sdfs[i] = sdfs[i].assign(sid=session_number)

# Apply the column-splitting and session-id tagging to all six sessions.
for session_idx in range(6):
    split_position_vectors(session_idx)
    add_sid(session_idx)

# Preview the first session's dataframe.
sdfs[0].head()
Out[223]:
actionId dateTime logType length startPosition endPosition uid iptd dt sp_x sp_y sp_z ep_x ep_y ep_z sid
0 0 2019-08-21 14:09:56.963186 PlayerConnect NaN [0.0, 0.0, 0.0] [0.0, 0.0, 0.0] 1 0.000000 0 0.000000 0.0 0.000000 0.000000 0.0 0.000000 1
1 0 2019-08-21 14:10:31.829816 PlayerConnect NaN [0.0, 0.0, 0.0] [0.0, 0.0, 0.0] 2 0.000000 34 0.000000 0.0 0.000000 0.000000 0.0 0.000000 1
2 2 2019-08-21 14:11:23.222446 Teleport 11.409659 [0.0, 0.0, 0.0] [7.393284, 0.0, 8.896915] 2 11.567876 86 0.000000 0.0 0.000000 7.393284 0.0 8.896915 1
3 4 2019-08-21 14:11:48.175197 Teleport 7.346649 [7.393284, 0.0, 8.896915] [2.16486454, 0.0, 3.64042282] 2 4.235483 111 7.393284 0.0 8.896915 2.164865 0.0 3.640423 1
4 6 2019-08-21 14:12:34.663604 Teleport 15.574819 [2.16486454, 0.0, 3.64042282] [1.39462256, 0.0, -12.0114765] 2 12.092169 157 2.164865 0.0 3.640423 1.394623 0.0 -12.011477 1

Wrangle dataframe array into single dataframe

In [224]:
# Concatenate all six per-session dataframes into one long dataframe.
# pd.concat replaces the removed DataFrame.append; each frame keeps its own
# index, which becomes the within-session row number below.
df = pd.concat(sdfs, ignore_index=False)

df = df.reset_index()
df = df.rename(columns={"index": "session_index"})
# Make sure these cols are numbers not strings.
df['uid'] = df['uid'].astype(int)
df['sid'] = df['sid'].astype(int)
df['actionId'] = df['actionId'].astype(int)
df
Out[224]:
session_index actionId dateTime logType length startPosition endPosition uid iptd dt sp_x sp_y sp_z ep_x ep_y ep_z sid
0 0 0 2019-08-21 14:09:56.963186 PlayerConnect NaN [0.0, 0.0, 0.0] [0.0, 0.0, 0.0] 1 0.000000 0 0.000000 0.0 0.000000 0.000000 0.0 0.000000 1
1 1 0 2019-08-21 14:10:31.829816 PlayerConnect NaN [0.0, 0.0, 0.0] [0.0, 0.0, 0.0] 2 0.000000 34 0.000000 0.0 0.000000 0.000000 0.0 0.000000 1
2 2 2 2019-08-21 14:11:23.222446 Teleport 11.409659 [0.0, 0.0, 0.0] [7.393284, 0.0, 8.896915] 2 11.567876 86 0.000000 0.0 0.000000 7.393284 0.0 8.896915 1
3 3 4 2019-08-21 14:11:48.175197 Teleport 7.346649 [7.393284, 0.0, 8.896915] [2.16486454, 0.0, 3.64042282] 2 4.235483 111 7.393284 0.0 8.896915 2.164865 0.0 3.640423 1
4 4 6 2019-08-21 14:12:34.663604 Teleport 15.574819 [2.16486454, 0.0, 3.64042282] [1.39462256, 0.0, -12.0114765] 2 12.092169 157 2.164865 0.0 3.640423 1.394623 0.0 -12.011477 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
550 31 108 2019-09-05 13:07:02.038217 Teleport 8.610597 [3.716706, 0.0, -6.205044] [2.85037, 0.0, 2.65972185] 12 3.739220 790 3.716706 0.0 -6.205044 2.850370 0.0 2.659722 6
551 32 144 2019-09-05 13:07:52.584193 Teleport 6.083108 [2.52882743, 0.0, 6.3850913] [0.3820052, 0.0, 0.2878809] 11 3.423223 841 2.528827 0.0 6.385091 0.382005 0.0 0.287881 6
552 33 121 2019-09-05 13:08:36.169676 Teleport 2.650542 [2.85037, 0.0, 2.65972185] [0.3415835, 0.0, 1.35904276] 12 1.071924 884 2.850370 0.0 2.659722 0.341583 0.0 1.359043 6
553 34 167 2019-09-05 13:10:43.515260 Teleport 22.687818 [0.3820052, 0.0, 0.2878809] [16.158741, 0.0, 16.5745926] 11 21.947561 1012 0.382005 0.0 0.287881 16.158741 0.0 16.574593 6
554 35 175 2019-09-05 13:11:17.537750 Teleport 20.394730 [16.158741, 0.0, 16.5745926] [1.67523384, 0.0, 2.03869152] 11 1.496845 1046 16.158741 0.0 16.574593 1.675234 0.0 2.038692 6

555 rows × 17 columns

In [225]:
import altair as alt

# Restrict the plotting source to teleport events (drop PlayerConnect rows).
source = df.loc[df['logType'].eq('Teleport')]

Distribution of locations

In [226]:
base = alt.Chart(source)

xscale = alt.Scale(domain=(-25.0, 25.0))
yscale = alt.Scale(domain=(-25.0, 25.0))

bar_args = {'opacity': .3, 'binSpacing': 0}

# Scatter of teleport end positions in the horizontal (ep_x / ep_z) plane,
# coloured by user and shaped by session.
points = base.mark_point().encode(
    alt.X('ep_x:Q', scale=xscale),
    alt.Y('ep_z:Q', scale=yscale),
    color='uid:N',
    shape='sid:N'
)

# Marginal histogram of ep_x along the top edge.
top_hist = base.mark_bar(**bar_args).encode(
    alt.X('ep_x:Q',
          # when using bins, the axis scale is set through
          # the bin extent, so we do not specify the scale here
          # (which would be ignored anyway)
          bin=alt.Bin(maxbins=20, extent=xscale.domain),
          stack=None,
          title=''
         ),
    alt.Y('count()', stack=None, title=''),
).properties(height=60)

# Marginal histogram of ep_z along the right edge.
# Fix: this previously binned ep_x, which did not match the scatter's y-axis.
right_hist = base.mark_bar(**bar_args).encode(
    alt.Y('ep_z:Q',
          bin=alt.Bin(maxbins=20, extent=yscale.domain),
          stack=None,
          title='',
         ),
    alt.X('count()', stack=None, title=''),
).properties(width=60)

top_hist & (points | right_hist)
Out[226]:

Session-user Teleport Location Plots

In [227]:
# Inter-personal teleport distance over session time (step plot),
# one panel per session, coloured by user.
alt.Chart(source).mark_line(interpolate='step-after').encode(
    x=alt.X('dt:Q'),
    y=alt.Y('iptd:Q'),
    color=alt.Color('uid:N')
).properties(
    width=180,
    height=180
).facet(column='sid:N')
Out[227]:

Teleport-time overlaps

The plot shows whether any temporal patterns occur across groups; however, individual users could create hotspots just by teleporting many times in the same area.

In [228]:
# 2-D histogram (heatmap) of IPTD versus session time across all sessions.
alt.Chart(source).mark_rect().encode(
    x=alt.X('dt:Q', bin=alt.Bin(maxbins=40)),
    y=alt.Y('iptd:Q', bin=alt.Bin(maxbins=40)),
    color=alt.Color('count():Q', scale=alt.Scale(scheme='greenblue'))
)
Out[228]:

Session teleport locations in sequence.

Positions over time, but too much data in each graph.

In [229]:
# Teleport end positions connected in time order, one panel per session.
alt.Chart(source).mark_line(point=True).encode(
    x=alt.X('ep_x', scale=alt.Scale(zero=True)),
    y=alt.Y('ep_z', scale=alt.Scale(zero=True)),
    order=alt.Order('dt'),
    color=alt.Color('uid:N')
).properties(
    width=180,
    height=180
).facet(column='sid:N')
Out[229]:

Session 3 Teleport Data

All teleport positions for session 3

In [230]:
# All teleport end positions for session 3, in time order, one panel per user.
alt.Chart(source[source.sid == 3]).mark_line(point=True).encode(
    x=alt.X('ep_x', scale=alt.Scale(zero=True)),
    y=alt.Y('ep_z', scale=alt.Scale(zero=True)),
    order=alt.Order('dt')
).properties(
    width=180,
    height=180
).facet(column='uid:N')
Out[230]:

Positions of teleports for S3 assistive action example

In [231]:
# Zoom into the session-3 assistive-action window (1100 s < dt < 1300 s),
# with tooltips for inspecting individual teleports.
alt.Chart(source[(source.sid == 3) & (source.dt > 1100) & (source.dt < 1300)]).mark_line(point=True).encode(
    x=alt.X('ep_x', scale=alt.Scale(zero=True, domain=(-10, 10))),
    y=alt.Y('ep_z', scale=alt.Scale(zero=True, domain=(-10, 10))),
    order=alt.Order('dt'),
    tooltip=['hoursminutes(dateTime):T', 'actionId', 'dt', 'ep_x', 'ep_z'],
    text=alt.Text('dt', format='.1f')
).properties(
    width=180,
    height=180
).facet(column='uid:N')
Out[231]:

Teleport Frequency Info

In [232]:
# Count teleports per user and convert to teleports-per-minute (tfm).
# .copy() avoids the SettingWithCopyWarning raised when mutating a
# filtered view of df (seen in the original output).
grp = df[df['logType'] == 'Teleport'].copy()
grp['uid'] = grp['uid'].astype(int)
grp = grp.rename(columns={
    'logType': 'Teleport'
})

grp = grp.groupby(['uid']).agg({'Teleport': 'count'})
grp = grp.reset_index()
# Vectorised per-user rate: uids 1-10 are divided by 30, uids 11-12 by 18
# — presumably the session durations in minutes; TODO confirm.
mask = (grp['uid'] < 11)
grp['tfm'] = np.where(mask, grp.Teleport / 30, grp.Teleport / 18)
grp
/Users/thomasdeacon/opt/anaconda3/envs/invoke_xdf/lib/python3.7/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
Out[232]:
uid Teleport tfm
0 1 7 0.233333
1 2 44 1.466667
2 3 21 0.700000
3 4 35 1.166667
4 5 25 0.833333
5 6 29 0.966667
6 7 41 1.366667
7 8 38 1.266667
8 9 185 6.166667
9 10 84 2.800000
10 11 11 0.611111
11 12 23 1.277778
In [233]:
# Bar chart of teleports-per-minute for each user.
alt.Chart(grp).mark_bar().encode(
    alt.X('uid:O', axis=alt.Axis(title='User ID', labelAngle=0)),
    alt.Y('tfm:Q', axis=alt.Axis(title='Teleports per minute'))
)
Out[233]: