Sitz Blogz Sitz Blogz - 4 months ago 17
Python Question

How to include dynamic time?

I am trying to pull the logs with respect to time slots. The program below runs very fine when no. of hours are given and the logs in that range gets extracted.

But now I also what to include Start and end to be dynamically given. i.e. say between

8 am to 8pm
or
6am to 8am
and so on.

How do I get that? Any edit in the current program will also do or a separate program will also do.

Input: Mini Version of INPUT

Code:

import pandas as pd
from datetime import datetime,time
import numpy as np

fn = r'00_Dart.csv'
cols = ['UserID','StartTime','StopTime', 'gps1', 'gps2']
df = pd.read_csv(fn, header=None, names=cols)

df['m'] = df.StopTime + df.StartTime
df['d'] = df.StopTime - df.StartTime

# 'start' and 'end' for the reporting DF: `r`
# which will contain equal intervals (1 hour in this case)
start = pd.to_datetime(df.StartTime.min(), unit='s').date()
end = pd.to_datetime(df.StopTime.max(), unit='s').date() + pd.Timedelta(days=1)

# building reporting DF: `r`
freq = '1H' # 1 Hour frequency
idx = pd.date_range(start, end, freq=freq)
r = pd.DataFrame(index=idx)
r['start'] = (r.index - pd.datetime(1970,1,1)).total_seconds().astype(np.int64)

# 1 hour in seconds, minus one second (so that we will not count it twice)
interval = 60*60 - 1

r['LogCount'] = 0
r['UniqueIDCount'] = 0

for i, row in r.iterrows():
# intervals overlap test
# https://en.wikipedia.org/wiki/Interval_tree#Overlap_test
# i've slightly simplified the calculations of m and d
# by getting rid of division by 2,
# because it can be done eliminating common terms
u = df[np.abs(df.m - 2*row.start - interval) < df.d + interval].UserID
r.ix[i, ['LogCount', 'UniqueIDCount']] = [len(u), u.nunique()]

r['Date'] = pd.to_datetime(r.start, unit='s').dt.date
r['Day'] = pd.to_datetime(r.start, unit='s').dt.weekday_name.str[:3]
r['StartTime'] = pd.to_datetime(r.start, unit='s').dt.time
r['EndTime'] = pd.to_datetime(r.start + interval + 1, unit='s').dt.time

#r.to_csv('results.csv', index=False)
#print(r[r.LogCount > 0])
#print (r['StartTime'], r['EndTime'], r['Day'], r['LogCount'], r['UniqueIDCount'])

rout = r[['Date', 'StartTime', 'EndTime', 'Day', 'LogCount', 'UniqueIDCount'] ]
#print rout
rout.to_csv('one_hour.csv', index=False, header=False)


Edit:

In Simple words, I should be able to give
StartTime
and
EndTIme
in the program. The code below is very much close to what I am trying to do. But how convert this to pandas.

from datetime import datetime,time

start = time(8,0,0)
end = time(20,0,0)

with open('USC28days_0_20', 'r') as infile, open('USC28days_0_20_time','w') as outfile:
for row in infile:
col = row.split()
t1 = datetime.fromtimestamp(float(col[2])).time()
t2 = datetime.fromtimestamp(float(col[3])).time()
print (t1 >= start and t2 <= end)


Edit Two: Working answer in Pandas

Taking a Part from the @MaxU's answer from selected answer. The below code strips the required group of logs between the given
StartTime
and
StopTime


import pandas as pd
from datetime import datetime,time
import numpy as np

fn = r'00_Dart.csv'
cols = ['UserID','StartTime','StopTime', 'gps1', 'gps2']

df = pd.read_csv(fn, header=None, names=cols)

#df['m'] = df.StopTime + df.StartTime
#df['d'] = df.StopTime - df.StartTime

# filter input data set ...
start_hour = 8
end_hour = 9
df = df[(pd.to_datetime(df.StartTime, unit='s').dt.hour >= start_hour) & (pd.to_datetime(df.StopTime, unit='s').dt.hour <= end_hour)]

print df

df.to_csv('time_hour.csv', index=False, header=False)


But: If there was a possibility to have control on minutes and seconds also would be great solution.

At present this also strips the logs which have the hour of
StopTime
but also the minutes and seconds until the next hour.

Something like

start_hour = 8:0:0
end_hour = 9:0:0 - 1 # -1 to get the logs until 8:59:59


But this gives me an error

Answer

try this:

import pandas as pd
from datetime import datetime,time
import numpy as np

fn = r'D:\data\gDrive\data\.stack.overflow\2016-07\dart_small.csv'
cols = ['UserID','StartTime','StopTime', 'gps1', 'gps2']

df = pd.read_csv(fn, header=None, names=cols)

df['m'] = df.StopTime + df.StartTime
df['d'] = df.StopTime - df.StartTime

# filter input data set ... 
start_hour = 8
end_hour = 20
df = df[(pd.to_datetime(df.StartTime, unit='s').dt.hour >= 8) & (pd.to_datetime(df.StartTime, unit='s').dt.hour <= 20)]


# 'start' and 'end' for the reporting DF: `r`
# which will contain equal intervals (1 hour in this case)
start = pd.to_datetime(df.StartTime.min(), unit='s').date()
end = pd.to_datetime(df.StopTime.max(), unit='s').date() + pd.Timedelta(days=1)

# building reporting DF: `r`
freq = '1H'  # 1 Hour frequency
idx = pd.date_range(start, end, freq=freq)
r = pd.DataFrame(index=idx)
r = r[(r.index.hour >= start_hour) & (r.index.hour <= end_hour)]
r['start'] = (r.index - pd.datetime(1970,1,1)).total_seconds().astype(np.int64)

# 1 hour in seconds, minus one second (so that we will not count it twice)
interval = 60*60 - 1

r['LogCount'] = 0
r['UniqueIDCount'] = 0

for i, row in r.iterrows():
        # intervals overlap test
        # https://en.wikipedia.org/wiki/Interval_tree#Overlap_test
        # i've slightly simplified the calculations of m and d
        # by getting rid of division by 2,
        # because it can be done eliminating common terms
    u = df[np.abs(df.m - 2*row.start - interval) < df.d + interval].UserID
    r.ix[i, ['LogCount', 'UniqueIDCount']] = [len(u), u.nunique()]

r['Date'] = pd.to_datetime(r.start, unit='s').dt.date
r['Day'] = pd.to_datetime(r.start, unit='s').dt.weekday_name.str[:3]
r['StartTime'] = pd.to_datetime(r.start, unit='s').dt.time
r['EndTime'] = pd.to_datetime(r.start + interval + 1, unit='s').dt.time

#r.to_csv('results.csv', index=False)
#print(r[r.LogCount > 0])
#print (r['StartTime'], r['EndTime'], r['Day'], r['LogCount'], r['UniqueIDCount'])

rout =  r[['Date', 'StartTime', 'EndTime', 'Day', 'LogCount', 'UniqueIDCount'] ]
#print rout

OLD answer:

from_time = '08:00'
to_time = '18:00'
rout.between_time(from_time, to_time).to_csv('one_hour.csv', index=False, header=False)
Comments