Patching Dec 9, 2021 6-7a CST- All GitLab services may be unavailable for 5-10 minutes

Commit e6a6e1cf authored by Matthew Krafczyk's avatar Matthew Krafczyk
Browse files

Update docs, remove extraneous print statements, update version

parent c00423df
from .sequence import sequence_df, contiguous_group_indices
__version__ = "0.1.0"
__version__ = "0.2.0"
__all__ = [ "sequence_df", "contiguous_group_indices" ]
......@@ -114,7 +114,7 @@ def sequence_df(df, lags, group_specs):
s2 | f1(s2) | f2(s2) |
...
The function then returns for num_before=2, num_after=0, inc_date=True:
The function then returns for a set of lags:
Sequence | ('Feat 1' , -2) | ('Feat 2', -2) | ('Feat 1', -1) | ('Feat 2', -1) | ('Feat 1', 0) | ('Feat 2', 0) |
s3 | f1(s1) | f2(s1) | f1(s2) | f2(s2) | f1(s3) | f2(s3) |
......@@ -127,29 +127,26 @@ def sequence_df(df, lags, group_specs):
Named Arguments
df: A Pandas dataframe containing a set of features for each day
lags: A list of lags to include
num_before: The number of days before the first predicted day needed for a prediction.
num_after: The number of days after the first predicted day
sequence_col: The column to use, if None will use the index.
sequence_function: A function to use to compute sequence differences. If None, it'll just take the difference.
This function should have behavior like this:
group_specs: A list of tuples defining how groups are discovered.
'group' type specs - Group type specs specify columns, or index levels where the groups are already defined.
'sequence' type specs - Sequence type specs specify 'sequencable' columns. These columns have a 'by-one' well ordering defined.
This well ordering can either be implicit if you use integers, or you can pass a function which defines it.
'level' subtype specs - These specs indicate that the data passed with the spec names a specific level of the data frame's index to use.
'column' subtype specs - These specs indicate that the data passed with the spec is a column: either the name of a column in the data frame, or the column itself (e.g. a series) passed in directly.
'index' subtype specs - sequence specs also support the index subtype. This indicates to just use the index of the data frame.
A spec is specified like so: (<type>, <subtype>, data, [<sequence_function>])
A few examples:
('group', 'level', 'Security') - Use the 'Security' index level as a pre-defined grouping
('sequence', 'level', 'Date', days_diff) - Use the 'Date' index level as a sequencable column to define a grouping. Use the days_diff function to define the order
('sequence', 'index') - Use the index of the dataframe as a sequencable column. Since no function is specified, it will just use arithmetic.
('group', 'column', groups) - Use the groups series to define the groups to use. This is a column passed in.
('group', 'column', 'Group') - Use the 'Group' column of the dataframe to define the groups to use. Here the column is referenced by name.
returns
A pandas dataframe containing rows of prediction and/or label data.
"""
# group_index_level=None, group_col=None,
# sequence_index_level=None, sequence_col=None,
# sequence_function=None):
print("sequence_df start")
# Build group sequences
# Group spec:
# ('group', 'level', 'level_name')
# ('group', 'column', 'column_name')
# ('sequence', 'level', 'level_name', <sequence_function>)
# ('sequence', 'column', 'column_name', <sequence_function>)
by = []
level = []
remove_columns = []
......@@ -219,17 +216,6 @@ def sequence_df(df, lags, group_specs):
# Remove columns
temp_df = df.loc[:,list(filter(lambda c: c not in remove_columns, df.columns))]
print("---temp_df---")
print(temp_df)
print(type(temp_df))
print(temp_df.dtypes)
print("---adjusting by and level---")
print("level")
print(level)
print("by")
print(by)
if len(level) == 0:
level = None
if len(by) == 0:
......@@ -247,70 +233,30 @@ def sequence_df(df, lags, group_specs):
by.append(lvl_vals)
level = None
print("---after adjustment---")
print("level:")
print(level)
print("by:")
print(by)
# Change column types to support nans
integer_columns = temp_df.dtypes[temp_df.dtypes.apply(pd.api.types.is_integer_dtype)]
for col_name in integer_columns.index:
print(f"1: col_name: {col_name}")
temp_df.loc[:,col_name] = temp_df.loc[:,col_name].astype(pd.Int64Dtype())
print("---temp_df after integer column change---")
print(temp_df)
print(temp_df.dtypes)
# Produce Groupby
# Groupby
print("---before groupby---")
print(f"by")
print(by)
print(type(by))
print(f"level:")
print(level)
temp_gbydf = temp_df.groupby(by=by, level=level, axis=0)
print("---after groupby---")
dfs = []
print("After groupby")
print(type(temp_gbydf))
print(temp_gbydf.shift(0))
print(temp_gbydf.shift(1))
print(temp_gbydf.shift(2))
print("before for loop")
for lag in lags:
print(f"loop lag: {lag}")
slice_df = temp_gbydf.shift(-lag)
print(slice_df)
print(type(slice_df))
print("loop 2")
if slice_df is not None:
print("loop 3")
slice_df.columns = pd.MultiIndex.from_product([slice_df.columns,[lag]])
print("loop 4")
dfs.append(slice_df)
print("loop 5")
# Join segments into full dataframe.
DF = pd.concat(dfs, axis=1, join='outer').dropna()
# Restore the original types of the integer columns
print("DF")
print(DF)
print("columns")
for col in DF.columns:
print(col)
if col[0] in integer_columns.index:
DF[col] = DF[col].astype(integer_columns.loc[col[0]])
print("sequence_df end")
return DF
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment