Patching Dec 9, 2021 6-7a CST- All GitLab services may be unavailable for 5-10 minutes

Commit 796d28ff authored by Matthew Krafczyk's avatar Matthew Krafczyk
Browse files

re-organize, update code, add tests

parent cb99efe2
import pandas as pd
def contiguous_group_indices(df, index_level=None, sequence_col=None, sequence_function=None):
def contiguous_group_indices(df, sequence_index_level=None, sequence_col=None, sequence_function=None):
"""
Produce series of continuous group labels for a given sequence column and sequencing function
......@@ -26,7 +26,7 @@ def contiguous_group_indices(df, index_level=None, sequence_col=None, sequence_f
2018Q4 26
2019Q2 20
Passing index_level='Quarter' and a sequence function like yq_diff:
Passing sequence_index_level='Quarter' and a sequence function like yq_diff:
def yr(quarter):
return int(quarter[:4])
def mon(quarter):
......@@ -34,7 +34,7 @@ def contiguous_group_indices(df, index_level=None, sequence_col=None, sequence_f
def yq_diff(yq, yq_ref):
return ((yr(yq)*4+mon(yq))-((yr(yq_ref)*4)+mon(yq_ref)))
grp_ids = contiguous_group_indices(df, index_level='Quarter', sequence_function=yq_diff)
grp_ids = contiguous_group_indices(df, sequence_index_level='Quarter', sequence_function=yq_diff)
Yields grp_ids as:
......@@ -63,11 +63,11 @@ def contiguous_group_indices(df, index_level=None, sequence_col=None, sequence_f
# Use the sequence column if it's a Series
sequence_series = sequence_col
else:
sequence_series = df[sequence_col]
elif index_level is not None:
if index_level not in df.index.names:
raise ValueError(f"index_level {index_level} not in the data frame. available levels: {df.index.names}")
level_idx = df.index.names.index(index_level)
sequence_series = df.loc[:,sequence_col]
elif sequence_index_level is not None:
if sequence_index_level not in df.index.names:
raise ValueError(f"sequence_index_level {sequence_index_level} not in the data frame. available levels: {df.index.names}")
level_idx = df.index.names.index(sequence_index_level)
sequence_series = df.index.to_series().apply(lambda t: t[level_idx])
else:
# Get the dataframe index
......@@ -142,7 +142,10 @@ def sequence_plain_df(df, num_before, num_after, inc_val=True):
return DF
def sequence_df(df, num_before, num_after, beg_val=None, end_val=None, inc_val=True, index_level=None, sequence_col=None, sequence_function=None):
def sequence_df(df, lags,
group_index_level=None, group_col=None,
sequence_index_level=None, sequence_col=None,
sequence_function=None):
"""
Sequence feature data into multi-component rows.
......@@ -168,39 +171,106 @@ def sequence_df(df, num_before, num_after, beg_val=None, end_val=None, inc_val=T
Named Arguments
df: A Pandas dataframe containing a set of features for each day
beg_val: The first sequence value for which sequences are needed
end_val: The last sequence value for which sequences are needed
lags: A list of lags to include
num_before: The number of before the first predicted day needed for a prediction.
num_after: The number of days after the first predicted day
sequence_col: The column to use, if None will use the index.
sequence_function: A function to use to compute sequence differences. If None, it'll just take the difference.
This function should have behavior like this:
Keyword Arguments
inc_date: Whether to include data from the first predicted day
returns
A pandas dataframe containing rows of prediction and/or label data.
"""
# Compute Group ids
G_ids = contiguous_group_indices(df, index_level=index_level, sequence_col=sequence_col, sequence_function=sequence_function)
print("sequence_df start")
dfs = []
max_id = G_ids.max()
num_skipped = 0
for g_id in range(1,max_id+1):
# For each group, first, check how many rows there are.
num_in_group = (G_ids == g_id).sum()
if num_in_group >= num_before+num_after+(1 if inc_val else 0):
# This group has enough data.
dfs.append(sequence_plain_df(df[G_ids == g_id], num_before, num_after, inc_val))
if group_index_level is None and group_col is None:
# Compute Group ids
G_ids = contiguous_group_indices(df, sequence_index_level=sequence_index_level, sequence_col=sequence_col, sequence_function=sequence_function)
else:
if type(group_col) is pd.core.series.Series:
G_ids = group_col
elif group_index_level is not None:
if group_index_level in df.index.names:
g_idx = df.index.names.index(group_index_level)
elif type(group_index_level) is int:
g_idx = group_index_level
else:
raise ValueError(f"group_index_level of type {type(group_index_level)} not supported")
G_ids = df.index.to_series().apply(lambda i: i[g_idx])
else:
num_skipped += 1
print(f"Skipped {num_skipped} groups when building dataframe")
if len(dfs) == 0:
raise RuntimeError("There was no data! to combine!")
print(f"Combined {len(dfs)} groups")
return pd.concat(dfs, axis=0)
G_ids = df[group_col]
print("Contiguous Ids")
print(G_ids)
if sequence_col is not None and type(sequence_col) is not pd.core.series.Series:
# If sequence_col specifies a column of the dataFrame, we remove this column from the sequencing.
temp_df = df.loc[:,list(filter(lambda c: c != sequence_col, df.columns))]
elif group_col is not None and type(group_col) is not pd.core.series.Series:
# If group_col specifies a column of the dataFrame, we remove this column from the sequencing.
temp_df = df.loc[:,list(filter(lambda c: c != group_col, df.columns))]
else:
temp_df = df
print("temp_df:")
print(temp_df)
print(type(temp_df))
print(temp_df.dtypes)
# Change column types to support nans
integer_columns = temp_df.dtypes[temp_df.dtypes.apply(pd.api.types.is_integer_dtype)]
for col_name in integer_columns.index:
print(f"1: col_name: {col_name}")
temp_df.loc[:,col_name] = temp_df.loc[:,col_name].astype(pd.Int64Dtype())
print("temp_df after integer column change")
print(temp_df)
print(temp_df.dtypes)
print("group ids")
print(G_ids)
# Groupby
temp_gbydf = temp_df.groupby(G_ids)
dfs = []
print("After groupby")
print(type(temp_gbydf))
print(temp_gbydf.shift(0))
print(temp_gbydf.shift(1))
print(temp_gbydf.shift(2))
print("before for loop")
for lag in lags:
print(f"loop lag: {lag}")
slice_df = temp_gbydf.shift(-lag)
print(slice_df)
print(type(slice_df))
print("loop 2")
if slice_df is not None:
print("loop 3")
slice_df.columns = pd.MultiIndex.from_product([slice_df.columns,[lag]])
print("loop 4")
dfs.append(slice_df)
print("loop 5")
# Join segments into full dataframe.
DF = pd.concat(dfs, axis=1, join='outer').dropna()
# Restore the original types of the integer columns
print("DF")
print(DF)
print("columns")
for col in DF.columns:
print(col)
if col[0] in integer_columns.index:
DF[col] = DF[col].astype(integer_columns.loc[col[0]])
print("sequence_df end")
return DF
import pandas as pd
import pandas_sequence as pds
def test_basic_group_col_1_1():
    """Sequence with lags [-1, 0], grouping by an integer 'Group' column.

    The 'Value' column restarts at 1 in every group.
    """
    lags = [-1, 0]
    group_labels = [1, 1, 1, 1, 2, 2, 2, 3]
    values = [1, 2, 3, 4, 1, 2, 3, 1]
    frame = pd.DataFrame(
        [[label, value] for label, value in zip(group_labels, values)],
        columns=['Group', 'Value'],
        index=list(range(8)),
    )

    result = pds.sequence_df(frame, lags, group_col='Group')

    # Expected rows are the ones that have a predecessor inside their group
    # (index 0, 4 and 7 are the first row of their respective groups).
    expected = pd.DataFrame(
        [[value - 1, value] for value in (2, 3, 4, 2, 3)],
        columns=pd.MultiIndex.from_product([['Value'], lags]),
        index=[1, 2, 3, 5, 6],
    )
    assert result.equals(expected)
def test_basic_group_col_1_2():
    """Sequence with lags [-1, 0], grouping by a string-valued 'Group' column.

    The 'Value' column restarts at 1 in every group.
    """
    lags = [-1, 0]
    group_labels = list('AAAABBBC')
    values = [1, 2, 3, 4, 1, 2, 3, 1]
    frame = pd.DataFrame(
        [[label, value] for label, value in zip(group_labels, values)],
        columns=['Group', 'Value'],
        index=list(range(8)),
    )

    result = pds.sequence_df(frame, lags, group_col='Group')

    # Index 0, 4 and 7 open a new group, so they are absent from the target.
    expected = pd.DataFrame(
        [[value - 1, value] for value in (2, 3, 4, 2, 3)],
        columns=pd.MultiIndex.from_product([['Value'], lags]),
        index=[1, 2, 3, 5, 6],
    )
    assert result.equals(expected)
def test_basic_group_col_1_3():
    """Sequence with lags [-1, 0], passing the group labels as a Series.

    The frame itself only holds the 'Value' column; the grouping is supplied
    through a separate pandas Series handed to ``group_col``.
    """
    lags = [-1, 0]
    values = [1, 2, 3, 4, 1, 2, 3, 1]
    frame = pd.DataFrame(
        [[value] for value in values],
        columns=['Value'],
        index=list(range(8)),
    )
    group_col = pd.Series([1, 1, 1, 1, 2, 2, 2, 3])

    result = pds.sequence_df(frame, lags, group_col=group_col)

    # Index 0, 4 and 7 open a new group, so they are absent from the target.
    expected = pd.DataFrame(
        [[value - 1, value] for value in (2, 3, 4, 2, 3)],
        columns=pd.MultiIndex.from_product([['Value'], lags]),
        index=[1, 2, 3, 5, 6],
    )
    assert result.equals(expected)
def test_basic_1():
    """Sequence with lags [-1, 0] and no grouping or sequence arguments.

    The frame's integer index has gaps after 3 and after 7; the target only
    contains rows whose index directly follows the previous row's index.
    """
    lags = [-1, 0]
    values = [1, 2, 3, 4, 1, 2, 3, 1]
    frame = pd.DataFrame(
        [[value] for value in values],
        columns=['Value'],
        index=[0, 1, 2, 3, 5, 6, 7, 9],
    )

    result = pds.sequence_df(frame, lags)

    expected = pd.DataFrame(
        [[value - 1, value] for value in (2, 3, 4, 2, 3)],
        columns=pd.MultiIndex.from_product([['Value'], lags]),
        index=[1, 2, 3, 6, 7],
    )
    assert result.equals(expected)
def test_basic_group_index_1():
    """Sequence with lags [-1, 0], grouping by the 'Group' index level.

    The frame uses a two-level index ('Group', 'count') where 'count' runs
    from 0 to 7 across all rows.
    """
    lags = [-1, 0]
    level_names = ['Group', 'count']
    values = [1, 2, 3, 4, 1, 2, 3, 1]
    source_keys = list(zip('AAAABBBC', range(8)))
    frame = pd.DataFrame(
        [[value] for value in values],
        columns=['Value'],
        index=pd.MultiIndex.from_tuples(source_keys, names=level_names),
    )

    result = pds.sequence_df(frame, lags, group_index_level='Group')

    # The first row of each group ('A', 0), ('B', 4), ('C', 7) is not expected.
    expected_keys = [('A', 1), ('A', 2), ('A', 3), ('B', 5), ('B', 6)]
    expected = pd.DataFrame(
        [[value - 1, value] for value in (2, 3, 4, 2, 3)],
        columns=pd.MultiIndex.from_product([['Value'], lags]),
        index=pd.MultiIndex.from_tuples(expected_keys, names=level_names),
    )
    assert result.equals(expected)
def test_basic_sequence_index_1():
    """Sequence with lags [-1, 0], using the 'count' index level as sequence.

    The 'count' level restarts at 0 three times; the target only contains
    rows whose 'count' is greater than 0.
    """
    lags = [-1, 0]
    level_names = ['Group', 'count']
    values = [1, 2, 3, 4, 1, 2, 3, 1]
    counts = [0, 1, 2, 3, 0, 1, 2, 0]
    source_keys = list(zip('AAAABBBC', counts))
    frame = pd.DataFrame(
        [[value] for value in values],
        columns=['Value'],
        index=pd.MultiIndex.from_tuples(source_keys, names=level_names),
    )

    result = pds.sequence_df(frame, lags, sequence_index_level='count')

    expected_keys = [('A', 1), ('A', 2), ('A', 3), ('B', 1), ('B', 2)]
    expected = pd.DataFrame(
        [[value - 1, value] for value in (2, 3, 4, 2, 3)],
        columns=pd.MultiIndex.from_product([['Value'], lags]),
        index=pd.MultiIndex.from_tuples(expected_keys, names=level_names),
    )
    assert result.equals(expected)
def test_basic_group_col_2_1():
    """Sequence with lags [-1, 0], grouping by an integer 'Group' column.

    Unlike the 1_x variants, 'Value' keeps increasing (1..8) across groups,
    so the target shows that no pair spans a group boundary.
    """
    lags = [-1, 0]
    group_labels = [1, 1, 1, 1, 2, 2, 2, 3]
    values = list(range(1, 9))
    frame = pd.DataFrame(
        [[label, value] for label, value in zip(group_labels, values)],
        columns=['Group', 'Value'],
        index=list(range(8)),
    )

    result = pds.sequence_df(frame, lags, group_col='Group')

    # No [4, 5] or [7, 8] rows: those pairs would span two groups.
    expected = pd.DataFrame(
        [[value - 1, value] for value in (2, 3, 4, 6, 7)],
        columns=pd.MultiIndex.from_product([['Value'], lags]),
        index=[1, 2, 3, 5, 6],
    )
    assert result.equals(expected)
def test_basic_group_col_2_2():
    """Sequence with lags [-1, 0], grouping by a string-valued 'Group' column.

    'Value' keeps increasing (1..8) across groups, so the target shows that
    no pair spans a group boundary.
    """
    lags = [-1, 0]
    group_labels = list('AAAABBBC')
    values = list(range(1, 9))
    frame = pd.DataFrame(
        [[label, value] for label, value in zip(group_labels, values)],
        columns=['Group', 'Value'],
        index=list(range(8)),
    )

    result = pds.sequence_df(frame, lags, group_col='Group')

    # No [4, 5] or [7, 8] rows: those pairs would span two groups.
    expected = pd.DataFrame(
        [[value - 1, value] for value in (2, 3, 4, 6, 7)],
        columns=pd.MultiIndex.from_product([['Value'], lags]),
        index=[1, 2, 3, 5, 6],
    )
    assert result.equals(expected)
def test_basic_sequence_col_1_1():
    """Sequence with lags [-1, 0], using the integer 'Seq' column as sequence.

    'Seq' and 'Value' carry the same numbers; the target contains only the
    'Value' column, at the rows where 'Seq' is greater than 1.
    """
    lags = [-1, 0]
    steps = [1, 2, 3, 4, 1, 2, 3, 1]
    frame = pd.DataFrame(
        [[step, step] for step in steps],
        columns=['Seq', 'Value'],
        index=list(range(8)),
    )

    result = pds.sequence_df(frame, lags, sequence_col='Seq')

    expected = pd.DataFrame(
        [[value - 1, value] for value in (2, 3, 4, 2, 3)],
        columns=pd.MultiIndex.from_product([['Value'], lags]),
        index=[1, 2, 3, 5, 6],
    )
    assert result.equals(expected)
def test_basic_sequence_col_1_2():
    """Sequence with lags [-1, 0], using a letter-valued 'Seq' column.

    A custom ``sequence_function`` is supplied that returns the difference
    between the code points of two single-character strings.
    """
    def str_diff(s, ref_val):
        # Code-point distance between two one-character strings.
        return ord(s)-ord(ref_val)

    lags = [-1, 0]
    letters = list('ABCDABCA')
    values = [1, 2, 3, 4, 1, 2, 3, 1]
    frame = pd.DataFrame(
        [[letter, value] for letter, value in zip(letters, values)],
        columns=['Seq', 'Value'],
        index=list(range(8)),
    )

    result = pds.sequence_df(
        frame,
        lags,
        sequence_col='Seq',
        sequence_function=str_diff,
    )

    # Rows whose 'Seq' letter is 'A' (index 0, 4, 7) are absent from the target.
    expected = pd.DataFrame(
        [[value - 1, value] for value in (2, 3, 4, 2, 3)],
        columns=pd.MultiIndex.from_product([['Value'], lags]),
        index=[1, 2, 3, 5, 6],
    )
    assert result.equals(expected)
def test_basic_sequence_col_1_3():
    """Sequence with lags [-1, 0], passing the sequence values as a Series.

    The frame itself only holds the 'Value' column; the letter sequence is
    supplied as a separate pandas Series together with a custom
    ``sequence_function`` based on code-point differences.
    """
    def str_diff(s, ref_val):
        # Code-point distance between two one-character strings.
        return ord(s)-ord(ref_val)

    lags = [-1, 0]
    values = [1, 2, 3, 4, 1, 2, 3, 1]
    frame = pd.DataFrame(
        [[value] for value in values],
        columns=['Value'],
        index=list(range(8)),
    )
    sequence_col = pd.Series(list('ABCDABCA'))

    result = pds.sequence_df(
        frame,
        lags,
        sequence_col=sequence_col,
        sequence_function=str_diff,
    )

    # Rows whose sequence letter is 'A' (index 0, 4, 7) are absent from the target.
    expected = pd.DataFrame(
        [[value - 1, value] for value in (2, 3, 4, 2, 3)],
        columns=pd.MultiIndex.from_product([['Value'], lags]),
        index=[1, 2, 3, 5, 6],
    )
    assert result.equals(expected)
def test_large_group_col_1_1():
    """Sequence with lags [-1, 0], grouping by an integer 'Group' column.

    NOTE(review): the input and target here are identical to those of
    test_basic_group_col_1_1; presumably a larger fixture was intended for
    the "large" variant -- confirm with the author.
    """
    lags = [-1, 0]
    group_labels = [1, 1, 1, 1, 2, 2, 2, 3]
    values = [1, 2, 3, 4, 1, 2, 3, 1]
    frame = pd.DataFrame(
        [[label, value] for label, value in zip(group_labels, values)],
        columns=['Group', 'Value'],
        index=list(range(8)),
    )

    result = pds.sequence_df(frame, lags, group_col='Group')

    # Index 0, 4 and 7 open a new group, so they are absent from the target.
    expected = pd.DataFrame(
        [[value - 1, value] for value in (2, 3, 4, 2, 3)],
        columns=pd.MultiIndex.from_product([['Value'], lags]),
        index=[1, 2, 3, 5, 6],
    )
    assert result.equals(expected)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment