Patching Dec 9, 2021, 6-7 a.m. CST - All GitLab services may be unavailable for 5-10 minutes

Commit c00423df authored by Matthew Krafczyk
Browse files

Change fundamental functioning of sequence_df. Add multitude of tests

parent 796d28ff
......@@ -100,52 +100,7 @@ def contiguous_group_indices(df, sequence_index_level=None, sequence_col=None, s
return grp_ids
def sequence_plain_df(df, num_before, num_after, inc_val=True):
    """
    Sequence feature data into multi-component rows.

    This function takes a dataframe containing various features over a sequence. This dataframe is
    assumed to be 'in-order', that is, each row is 1 'unit' away from its neighbouring rows.

    The input dataframe should have the following structure:

    Sequence | 'Feat 1' | 'Feat 2' |
    s1       | f1(s1)   | f2(s1)   |
    s2       | f1(s2)   | f2(s2)   |
    ...

    The function then returns for num_before=2, num_after=0, inc_val=True:

    Sequence | ('Feat 1', -2) | ('Feat 2', -2) | ('Feat 1', -1) | ('Feat 2', -1) | ('Feat 1', 0) | ('Feat 2', 0) |
    s3       | f1(s1)         | f2(s1)         | f1(s2)         | f2(s2)         | f1(s3)        | f2(s3)        |
    s4       | f1(s2)         | f2(s2)         | f1(s3)         | f2(s3)         | f1(s4)        | f2(s4)        |
    ...

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to sequence. It is not modified by this function.
    num_before : int
        Number of preceding rows to include (offsets -num_before .. -1).
    num_after : int
        Number of following rows to include (offsets 1 .. num_after).
    inc_val : bool, default True
        Whether to include the row itself (offset 0).

    Returns
    -------
    pandas.DataFrame
        The shifted copies of ``df`` joined side by side, with columns labelled
        ``(original column, offset)``. Rows containing any missing value are dropped.

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when no segment is selected at all
        (``num_before == 0``, ``num_after == 0`` and ``inc_val`` false).
    """
    # Capture the original labels once; every segment is relabelled from them.
    columns = df.columns

    # Build segments, ordered from the furthest 'before' offset to the furthest 'after' offset.
    segments = []
    for i in range(num_before, -num_after - 1, -1):
        if i == 0 and not inc_val:
            continue
        # shift() already returns a new frame; the zero offset must be copied explicitly,
        # otherwise relabelling the segment would relabel the caller's dataframe too.
        segment = df.copy() if i == 0 else df.shift(i)
        segment.columns = pd.MultiIndex.from_product([columns, [-i]])
        segments.append(segment)

    # Join segments into the full dataframe and drop the rows that lack a full window.
    return pd.concat(segments, axis=1, join='outer').dropna()
def sequence_df(df, lags,
group_index_level=None, group_col=None,
sequence_index_level=None, sequence_col=None,
sequence_function=None):
def sequence_df(df, lags, group_specs):
"""
Sequence feature data into multi-component rows.
......@@ -182,57 +137,142 @@ def sequence_df(df, lags,
A pandas dataframe containing rows of prediction and/or label data.
"""
# group_index_level=None, group_col=None,
# sequence_index_level=None, sequence_col=None,
# sequence_function=None):
print("sequence_df start")
if group_index_level is None and group_col is None:
# Compute Group ids
G_ids = contiguous_group_indices(df, sequence_index_level=sequence_index_level, sequence_col=sequence_col, sequence_function=sequence_function)
else:
if type(group_col) is pd.core.series.Series:
G_ids = group_col
elif group_index_level is not None:
if group_index_level in df.index.names:
g_idx = df.index.names.index(group_index_level)
elif type(group_index_level) is int:
g_idx = group_index_level
# Build group sequences
# Group spec:
# ('group', 'level', 'level_name')
# ('group', 'column', 'column_name')
# ('sequence', 'level', 'level_name', <sequence_function>)
# ('sequence', 'column', 'column_name', <sequence_function>)
by = []
level = []
remove_columns = []
for spec in group_specs:
if len(spec) < 2:
raise ValueError("Group specs must contain at least three elements")
if spec[0] == 'group':
# These are group type specs
if spec[1] == 'level':
if spec[2] not in df.index.names:
raise ValueError(f"Level name {spec[2]} not found in index!")
level.append(spec[2])
elif spec[1] == 'column':
if type(spec[2]) is pd.core.series.Series:
by.append(spec[2])
else:
if spec[2] not in df.columns:
raise ValueError(f"Column name {spec[2]} not found!")
by.append(df[spec[2]])
remove_columns.append(spec[2])
else:
raise ValueError(f"group_index_level of type {type(group_index_level)} not supported")
G_ids = df.index.to_series().apply(lambda i: i[g_idx])
raise ValueError(f"Group subtype {spec[1]} not supported")
elif spec[0] == 'sequence':
# These are the sequence type specs
if spec[1] == 'level':
if spec[2] not in df.index.names:
raise ValueError(f"Level name {spec[2]} not found in index!")
if len(spec) == 4:
if not callable(spec[3]):
raise ValueError(f"The fourth element of a group spec must be a callable!")
g_ids = contiguous_group_indices(df, sequence_index_level=spec[2], sequence_function=spec[3])
else:
g_ids = contiguous_group_indices(df, sequence_index_level=spec[2])
by.append(g_ids)
elif spec[1] == 'column':
if type(spec[2]) is pd.core.series.Series:
if len(spec) == 4:
if not callable(spec[3]):
raise ValueError(f"The fourth element of a group spec must be a callable!")
g_ids = contiguous_group_indices(df, sequence_col=spec[2], sequence_function=spec[3])
else:
g_ids = contiguous_group_indices(df, sequence_col=spec[2])
else:
if spec[2] not in df.columns:
raise ValueError(f"Column name {spec[2]} not found!")
remove_columns.append(spec[2])
if len(spec) == 4:
if not callable(spec[3]):
raise ValueError(f"The fourth element of a group spec must be a callable!")
g_ids = contiguous_group_indices(df, sequence_col=spec[2], sequence_function=spec[3])
else:
g_ids = contiguous_group_indices(df, sequence_col=spec[2])
by.append(g_ids)
elif spec[1] == 'index':
if len(spec) == 3:
if not callable(spec[2]):
raise ValueError(f"The third element of an index group spec must be a callable!")
g_ids = contiguous_group_indices(df, sequence_function=spec[2])
else:
g_ids = contiguous_group_indices(df)
by.append(g_ids)
else:
raise ValueError(f"Group subtype {spec[1]} not supported")
else:
G_ids = df[group_col]
raise ValueError(f"Group spec of type {spec[0]} not supported")
print("Contiguous Ids")
print(G_ids)
# Remove columns
temp_df = df.loc[:,list(filter(lambda c: c not in remove_columns, df.columns))]
if sequence_col is not None and type(sequence_col) is not pd.core.series.Series:
# If sequence_col specifies a column of the dataFrame, we remove this column from the sequencing.
temp_df = df.loc[:,list(filter(lambda c: c != sequence_col, df.columns))]
elif group_col is not None and type(group_col) is not pd.core.series.Series:
# If group_col specifies a column of the dataFrame, we remove this column from the sequencing.
temp_df = df.loc[:,list(filter(lambda c: c != group_col, df.columns))]
else:
temp_df = df
print("temp_df:")
print("---temp_df---")
print(temp_df)
print(type(temp_df))
print(temp_df.dtypes)
print("---adjusting by and level---")
print("level")
print(level)
print("by")
print(by)
if len(level) == 0:
level = None
if len(by) == 0:
by = None
# There's a bug where if both by and level are passed to groupby, it throws an error:
# TypeError: 'numpy.ndarray' object is not callable
# We need to mitigate this by detecting if both by and level are non-zero and if so, transition the level values
# to columns and add them to by.
if level is not None and by is not None:
for lvl in level:
lvl_idx = df.index.names.index(lvl)
lvl_vals = df.index.to_series().apply(lambda t: t[lvl_idx])
by.append(lvl_vals)
level = None
print("---after adjustment---")
print("level:")
print(level)
print("by:")
print(by)
# Change column types to support nans
integer_columns = temp_df.dtypes[temp_df.dtypes.apply(pd.api.types.is_integer_dtype)]
for col_name in integer_columns.index:
print(f"1: col_name: {col_name}")
temp_df.loc[:,col_name] = temp_df.loc[:,col_name].astype(pd.Int64Dtype())
print("temp_df after integer column change")
print("---temp_df after integer column change---")
print(temp_df)
print(temp_df.dtypes)
print("group ids")
print(G_ids)
# Produce Groupby
# Groupby
temp_gbydf = temp_df.groupby(G_ids)
print("---before groupby---")
print(f"by")
print(by)
print(type(by))
print(f"level:")
print(level)
temp_gbydf = temp_df.groupby(by=by, level=level, axis=0)
print("---after groupby---")
dfs = []
......
import pandas as pd
import pandas_sequence as pds
import numpy as np
def test_basic_group_col_1_1():
# Build test DataFrame
......@@ -16,7 +17,7 @@ def test_basic_group_col_1_1():
index=[0,1,2,3,4,5,6,7]
)
sequenced_df = pds.sequence_df(df, [-1,0], group_col='Group')
sequenced_df = pds.sequence_df(df, [-1,0], [('group', 'column', 'Group')])
target_df = pd.DataFrame(
[[1,2],
......@@ -27,7 +28,7 @@ def test_basic_group_col_1_1():
columns=pd.MultiIndex.from_product([['Value'],[-1,0]]),
index=[1,2,3,5,6])
assert sequenced_df.equals(target_df)
pd.testing.assert_frame_equal(sequenced_df, target_df)
def test_basic_group_col_1_2():
# Build test DataFrame
......@@ -44,7 +45,7 @@ def test_basic_group_col_1_2():
index=[0,1,2,3,4,5,6,7]
)
sequenced_df = pds.sequence_df(df, [-1,0], group_col='Group')
sequenced_df = pds.sequence_df(df, [-1,0], [('group', 'column', 'Group')])
target_df = pd.DataFrame(
[[1,2],
......@@ -55,7 +56,7 @@ def test_basic_group_col_1_2():
columns=pd.MultiIndex.from_product([['Value'],[-1,0]]),
index=[1,2,3,5,6])
assert sequenced_df.equals(target_df)
pd.testing.assert_frame_equal(sequenced_df, target_df)
def test_basic_group_col_1_3():
# Build test DataFrame
......@@ -73,7 +74,7 @@ def test_basic_group_col_1_3():
)
group_col = pd.Series([1,1,1,1,2,2,2,3])
sequenced_df = pds.sequence_df(df, [-1,0], group_col=group_col)
sequenced_df = pds.sequence_df(df, [-1,0], [('group', 'column', group_col)])
target_df = pd.DataFrame(
[[1,2],
......@@ -84,7 +85,7 @@ def test_basic_group_col_1_3():
columns=pd.MultiIndex.from_product([['Value'],[-1,0]]),
index=[1,2,3,5,6])
assert sequenced_df.equals(target_df)
pd.testing.assert_frame_equal(sequenced_df, target_df)
def test_basic_1():
# Build test DataFrame
......@@ -101,7 +102,7 @@ def test_basic_1():
index=[0,1,2,3,5,6,7,9]
)
sequenced_df = pds.sequence_df(df, [-1,0])
sequenced_df = pds.sequence_df(df, [-1,0], [('sequence', 'index')])
target_df = pd.DataFrame(
[[1,2],
......@@ -112,7 +113,7 @@ def test_basic_1():
columns=pd.MultiIndex.from_product([['Value'],[-1,0]]),
index=[1,2,3,6,7])
assert sequenced_df.equals(target_df)
pd.testing.assert_frame_equal(sequenced_df, target_df)
def test_basic_group_index_1():
# Build test DataFrame
......@@ -138,7 +139,7 @@ def test_basic_group_index_1():
names=['Group', 'count'])
)
sequenced_df = pds.sequence_df(df, [-1,0], group_index_level='Group')
sequenced_df = pds.sequence_df(df, [-1,0], [('group', 'level', 'Group')])
target_df = pd.DataFrame(
[[1,2],
......@@ -155,7 +156,7 @@ def test_basic_group_index_1():
('B', 6)],
names=['Group', 'count']))
assert sequenced_df.equals(target_df)
pd.testing.assert_frame_equal(sequenced_df, target_df)
def test_basic_sequence_index_1():
# Build test DataFrame
......@@ -181,7 +182,7 @@ def test_basic_sequence_index_1():
names=['Group', 'count'])
)
sequenced_df = pds.sequence_df(df, [-1,0], sequence_index_level='count')
sequenced_df = pds.sequence_df(df, [-1,0], [('sequence', 'level', 'count')])
target_df = pd.DataFrame(
[[1,2],
......@@ -198,7 +199,7 @@ def test_basic_sequence_index_1():
('B', 2)],
names=['Group', 'count']))
assert sequenced_df.equals(target_df)
pd.testing.assert_frame_equal(sequenced_df, target_df)
def test_basic_group_col_2_1():
# Build test DataFrame
......@@ -215,7 +216,7 @@ def test_basic_group_col_2_1():
index=[0,1,2,3,4,5,6,7]
)
sequenced_df = pds.sequence_df(df, [-1,0], group_col='Group')
sequenced_df = pds.sequence_df(df, [-1,0], [('group', 'column', 'Group')])
target_df = pd.DataFrame(
[[1,2],
......@@ -226,7 +227,7 @@ def test_basic_group_col_2_1():
columns=pd.MultiIndex.from_product([['Value'],[-1,0]]),
index=[1,2,3,5,6])
assert sequenced_df.equals(target_df)
pd.testing.assert_frame_equal(sequenced_df, target_df)
def test_basic_group_col_2_2():
# Build test DataFrame
......@@ -243,7 +244,7 @@ def test_basic_group_col_2_2():
index=[0,1,2,3,4,5,6,7]
)
sequenced_df = pds.sequence_df(df, [-1,0], group_col='Group')
sequenced_df = pds.sequence_df(df, [-1,0], [('group', 'column', 'Group')])
target_df = pd.DataFrame(
[[1,2],
......@@ -254,7 +255,7 @@ def test_basic_group_col_2_2():
columns=pd.MultiIndex.from_product([['Value'],[-1,0]]),
index=[1,2,3,5,6])
assert sequenced_df.equals(target_df)
pd.testing.assert_frame_equal(sequenced_df, target_df)
def test_basic_sequence_col_1_1():
# Build test DataFrame
......@@ -271,7 +272,7 @@ def test_basic_sequence_col_1_1():
index=[0,1,2,3,4,5,6,7]
)
sequenced_df = pds.sequence_df(df, [-1,0], sequence_col='Seq')
sequenced_df = pds.sequence_df(df, [-1,0], [('sequence', 'column', 'Seq')])
target_df = pd.DataFrame(
[[1,2],
......@@ -282,7 +283,7 @@ def test_basic_sequence_col_1_1():
columns=pd.MultiIndex.from_product([['Value'],[-1,0]]),
index=[1,2,3,5,6])
assert sequenced_df.equals(target_df)
pd.testing.assert_frame_equal(sequenced_df, target_df)
def test_basic_sequence_col_1_2():
# Build test DataFrame
......@@ -302,7 +303,7 @@ def test_basic_sequence_col_1_2():
def str_diff(s, ref_val):
return ord(s)-ord(ref_val)
sequenced_df = pds.sequence_df(df, [-1,0], sequence_col='Seq', sequence_function=str_diff)
sequenced_df = pds.sequence_df(df, [-1,0], [('sequence', 'column', 'Seq', str_diff)])
target_df = pd.DataFrame(
[[1,2],
......@@ -313,7 +314,7 @@ def test_basic_sequence_col_1_2():
columns=pd.MultiIndex.from_product([['Value'],[-1,0]]),
index=[1,2,3,5,6])
assert sequenced_df.equals(target_df)
pd.testing.assert_frame_equal(sequenced_df, target_df)
def test_basic_sequence_col_1_3():
# Build test DataFrame
......@@ -335,7 +336,7 @@ def test_basic_sequence_col_1_3():
def str_diff(s, ref_val):
return ord(s)-ord(ref_val)
sequenced_df = pds.sequence_df(df, [-1,0], sequence_col=sequence_col, sequence_function=str_diff)
sequenced_df = pds.sequence_df(df, [-1,0], [('sequence', 'column', sequence_col, str_diff)])
target_df = pd.DataFrame(
[[1,2],
......@@ -346,32 +347,259 @@ def test_basic_sequence_col_1_3():
columns=pd.MultiIndex.from_product([['Value'],[-1,0]]),
index=[1,2,3,5,6])
assert sequenced_df.equals(target_df)
pd.testing.assert_frame_equal(sequenced_df, target_df)
def test_large_group_col_1_1():
def test_large_1_1():
# Build test DataFrame
df = pd.DataFrame(
[[1,1],
[1,2],
[1,3],
[1,4],
[2,1],
[2,2],
[2,3],
[3,1]],
columns=['Group', 'Value'],
index=[0,1,2,3,4,5,6,7]
[['A', '2020-01-01', 0.2],
['A', '2020-01-02', -0.1],
['A', '2020-01-03', 0.345],
['A', '2020-01-04', 0.55],
['A', '2020-01-05', 1.2],
['B', '2020-01-01', -2.5],
['B', '2020-01-02', 3.7],
['B', '2020-01-03', 3.5],
['B', '2020-01-04', 0.3],
['B', '2020-01-05', -1.],
['C', '2020-01-01', -2.1],
['C', '2020-01-02', 1.1],
['C', '2020-01-03', 1.123],
['C', '2020-01-04', 5.3],
['D', '2020-01-01', 5.55]],
columns=['Group', 'Date', 'Value'],
)
df = df.set_index(['Group', 'Date'])
sequenced_df = pds.sequence_df(df, [-1,0], group_col='Group')
sequenced_df = pds.sequence_df(df, [-1,0], [('group', 'level', 'Group')])
target_df = pd.DataFrame(
[[1,2],
[2,3],
[3,4],
[1,2],
[2,3]],
columns=pd.MultiIndex.from_product([['Value'],[-1,0]]),
index=[1,2,3,5,6])
[['A', '2020-01-02', 0.2, -0.1],
['A', '2020-01-03', -0.1, 0.345],
['A', '2020-01-04', 0.345, 0.55],
['A', '2020-01-05', 0.55, 1.2],
['B', '2020-01-02', -2.5, 3.7],
['B', '2020-01-03', 3.7, 3.5],
['B', '2020-01-04', 3.5, 0.3],
['B', '2020-01-05', 0.3, -1.],
['C', '2020-01-02', -2.1, 1.1],
['C', '2020-01-03', 1.1, 1.123],
['C', '2020-01-04', 1.123, 5.3]],
columns=['Group', 'Date', ('Value', -1), ('Value', 0)],
)
target_df = target_df.set_index(['Group', 'Date'])
target_df.columns = pd.MultiIndex.from_tuples(list(target_df.columns))
pd.testing.assert_frame_equal(sequenced_df, target_df)
def test_large_1_2():
    """
    Sequence a (Group, Date)-indexed frame with lags [-2, 0, 1], grouped on the 'Group' level.

    The expected frame keeps only the rows listed below and contains no rows for group 'D'.
    """
    # Input: one 'Value' column, indexed by (Group, Date).
    source_rows = [
        ['A', '2020-01-01', 0.2],
        ['A', '2020-01-02', -0.1],
        ['A', '2020-01-03', 0.345],
        ['A', '2020-01-04', 0.55],
        ['A', '2020-01-05', 1.2],
        ['B', '2020-01-01', -2.5],
        ['B', '2020-01-02', 3.7],
        ['B', '2020-01-03', 3.5],
        ['B', '2020-01-04', 0.3],
        ['B', '2020-01-05', -1.],
        ['C', '2020-01-01', -2.1],
        ['C', '2020-01-02', 1.1],
        ['C', '2020-01-03', 1.123],
        ['C', '2020-01-04', 5.3],
        ['D', '2020-01-01', 5.55],
    ]
    index_names = ['Group', 'Date']
    frame = pd.DataFrame(source_rows, columns=['Group', 'Date', 'Value']).set_index(index_names)

    result = pds.sequence_df(frame, [-2, 0, 1], [('group', 'level', 'Group')])

    # Expected: one column per requested lag, labelled ('Value', lag).
    expected_rows = [
        ['A', '2020-01-03', 0.2, 0.345, 0.55],
        ['A', '2020-01-04', -0.1, 0.55, 1.2],
        ['B', '2020-01-03', -2.5, 3.5, 0.3],
        ['B', '2020-01-04', 3.7, 0.3, -1.],
        ['C', '2020-01-03', -2.1, 1.123, 5.3],
    ]
    lag_labels = [('Value', -2), ('Value', 0), ('Value', 1)]
    expected = pd.DataFrame(expected_rows, columns=['Group', 'Date', *lag_labels]).set_index(index_names)
    # Make sure the tuple labels form a real two-level column index.
    expected.columns = pd.MultiIndex.from_tuples(list(expected.columns))

    pd.testing.assert_frame_equal(result, expected)
def test_large_1_3():
    """
    Sequence a (Group, Date)-indexed frame with lags given as [0, -1], grouped on the 'Group' level.

    The expected columns follow the order in which the lags were requested: ('Value', 0) first,
    then ('Value', -1).
    """
    # Input: one 'Value' column, indexed by (Group, Date).
    source_rows = [
        ['A', '2020-01-01', 0.2],
        ['A', '2020-01-02', -0.1],
        ['A', '2020-01-03', 0.345],
        ['A', '2020-01-04', 0.55],
        ['A', '2020-01-05', 1.2],
        ['B', '2020-01-01', -2.5],
        ['B', '2020-01-02', 3.7],
        ['B', '2020-01-03', 3.5],
        ['B', '2020-01-04', 0.3],
        ['B', '2020-01-05', -1.],
        ['C', '2020-01-01', -2.1],
        ['C', '2020-01-02', 1.1],
        ['C', '2020-01-03', 1.123],
        ['C', '2020-01-04', 5.3],
        ['D', '2020-01-01', 5.55],
    ]
    index_names = ['Group', 'Date']
    frame = pd.DataFrame(source_rows, columns=['Group', 'Date', 'Value']).set_index(index_names)

    result = pds.sequence_df(frame, [0, -1], [('group', 'level', 'Group')])

    # Expected: current value first, previous value second.
    expected_rows = [
        ['A', '2020-01-02', -0.1, 0.2],
        ['A', '2020-01-03', 0.345, -0.1],
        ['A', '2020-01-04', 0.55, 0.345],
        ['A', '2020-01-05', 1.2, 0.55],
        ['B', '2020-01-02', 3.7, -2.5],
        ['B', '2020-01-03', 3.5, 3.7],
        ['B', '2020-01-04', 0.3, 3.5],
        ['B', '2020-01-05', -1., 0.3],
        ['C', '2020-01-02', 1.1, -2.1],
        ['C', '2020-01-03', 1.123, 1.1],
        ['C', '2020-01-04', 5.3, 1.123],
    ]
    lag_labels = [('Value', 0), ('Value', -1)]
    expected = pd.DataFrame(expected_rows, columns=['Group', 'Date', *lag_labels]).set_index(index_names)
    # Make sure the tuple labels form a real two-level column index.
    expected.columns = pd.MultiIndex.from_tuples(list(expected.columns))

    pd.testing.assert_frame_equal(result, expected)
def test_large_1_4():
# Build test DataFrame
df = pd.DataFrame(
[['A', '2020-01-01', 0.2],
['A', '2020-01-02', -0.1],
['A', '2020-01-03', 0.345],
['A', '2020-01-04', 0.55],
['A', '2020-01-05', 1.2],
['B', '2020-01-01', -2.5],
['B', '2020-01-02', 3.7],
['B', '2020-01-03', 3.5],
['B', '2020-01-04', 0.3],
['B', '2020-01-05', -1.],
['C', '2020-01-01', -2.1],
['C', '2020-01-02', 1.1],
['C', '2020-01-03', 1.123],
['C', '2020-01-04', 5.3],
['D', '2020-01-01', 5.55]],
columns=['Group', 'Date', 'Value'],
)
df = df.set_index(['Group', 'Date'])
sequenced_df = pds.sequence_df(df, [-2, -1, 0], [('group', 'level', 'Group')])
target_df = pd.DataFrame(
[['A', '2020-01-03', 0.2, -0.1, 0.345],
['A', '2020-01-04', -0.1, 0.345, 0.55],
['A', '2020-01-05', 0.345, 0.55, 1.2],
['B', '2020-01-03', -2.5, 3.7, 3.5],
['B', '2020-01-04', 3.7, 3.5, 0.3],
['B', '2020-01-05', 3.5, 0.3, -1.],