Patching Dec 9, 2021 6-7a CST- All GitLab services may be unavailable for 5-10 minutes

Commit f0f2e862 authored by Matthew Krafczyk
Browse files

Separate out function to produce contiguous group identification

parent 8c155bed
import pandas as pd
def contiguous_group_indices(df, index_level=None, sequence_col=None, sequence_function=None):
    """
    Label every row of ``df`` with the id of the contiguous run it belongs to.

    A 'sequencable value' is any value with a countable number of discrete
    elements which may be ordered 'by 1' (integers, quarters, ...). Walking the
    rows in order, a row stays in the current group when its sequence value is
    exactly one step after the previous row's value; any other step (a gap, a
    repeat, a jump backwards) starts a new group. This is useful for proper
    column shift logic.

    The sequence values are selected with the following precedence:

    1. ``sequence_col`` given as a pandas Series: that series is used as-is.
    2. ``index_level`` given: the values of that index level are used.
    3. Neither given: the values of the index itself are used.
    4. Otherwise ``df[sequence_col]`` is used.

    Args:
        df: The DataFrame whose rows are to be labelled.
        index_level: Name of the index level holding the sequence values.
        sequence_col: Column name in ``df``, or a Series of sequence values.
        sequence_function: Callable ``f(value, reference) -> int`` returning the
            signed number of steps from ``reference`` to ``value``. When None,
            plain subtraction ``value - reference`` is used.

    Returns:
        An integer Series, indexed like the selected sequence series, holding
        group ids that start at 1 and increase by one at every break. An empty
        input yields an empty integer Series.

    Raises:
        ValueError: If ``index_level`` is not one of the index level names.
        TypeError: If the computed step offsets are not of an integer dtype.

    Note:
        Only consecutive rows are compared. When the sequence lives in an inner
        index level, the boundary between two outer groups is detected only if
        the step across that boundary happens not to be exactly 1.

    Ex:
        Consider the following DataFrame of Security prices. It has a
        multi-level index whose second level is a sequencable value 'Quarter'.

            Security Quarter  Price
            Sec-1    2019Q1   10.
                     2019Q2   11.
                     2019Q4   10.5
                     2020Q1   10.6
                     2020Q2   10.7
            Sec-2    2018Q1   25
                     2018Q2   24
                     2018Q3   25
                     2018Q4   26
                     2019Q2   20

        Passing index_level='Quarter' and a sequence function like yq_diff:

            def yr(quarter):
                return int(quarter[:4])

            def mon(quarter):
                return int(quarter[5:])

            def yq_diff(yq, yq_ref):
                return ((yr(yq)*4+mon(yq))-((yr(yq_ref)*4)+mon(yq_ref)))

            grp_ids = contiguous_group_indices(df, index_level='Quarter', sequence_function=yq_diff)

        Yields grp_ids as:

            Security Quarter
            Sec-1    2019Q1   1
                     2019Q2   1
                     2019Q4   2
                     2020Q1   2
                     2020Q2   2
            Sec-2    2018Q1   3
                     2018Q2   3
                     2018Q3   3
                     2018Q4   3
                     2019Q2   4

        The series indicates groups of contiguous values. We can find
        differences in price to previous quarters properly respecting gaps
        when they pop up.

            df['Price-diff'] = df['Price']-df['Price'].groupby(grp_ids).shift(1)
    """
    # Fetch the sequence series. index_level must be checked before the
    # "nothing given" fallback, otherwise it is silently ignored whenever
    # sequence_col is omitted (which is exactly the documented usage).
    if isinstance(sequence_col, pd.Series):
        sequence_series = sequence_col
    elif index_level is not None:
        if index_level not in df.index.names:
            raise ValueError(f"index_level {index_level} not in the data frame. available levels: {df.index.names}")
        # get_level_values works for both single- and multi-level indexes,
        # unlike positional indexing into the index tuples.
        sequence_series = df.index.get_level_values(index_level).to_series(index=df.index)
        # Keep the result unnamed so the returned group ids do not carry the
        # level name around.
        sequence_series.name = None
    elif sequence_col is None:
        sequence_series = df.index.to_series()
    else:
        sequence_series = df[sequence_col]

    # Nothing to label: there is no reference value to measure against.
    if len(sequence_series) == 0:
        return pd.Series([], index=sequence_series.index, dtype="int64")

    # All offsets are measured against the first value of the sequence.
    ref_val = sequence_series.iloc[0]

    # If the sequence function is None, use the simple difference formula.
    if sequence_function is None:
        def offset_from_ref(value):
            return value - ref_val
    else:
        def offset_from_ref(value):
            return sequence_function(value, ref_val)

    try:
        sequence_values = sequence_series.apply(offset_from_ref)
    except Exception:
        print("Tried to subtract sequence values but ran into an error!")
        raise

    # Check that sequence is an integer type
    if not pd.api.types.is_integer_dtype(sequence_values.dtype):
        raise TypeError(f"Sequence value type: {sequence_values.dtype} is not an integer type!")

    # Step from the previous row; the first row has no predecessor and gets a
    # step of 0, which (being != 1) opens the first group.
    steps = sequence_values.diff().fillna(0).astype(int)

    # Every step that is not exactly +1 starts a new group.
    grp_ids = (steps != 1).cumsum()
    return grp_ids
def sequence_plain_df(df, num_before, num_after, inc_val=True):
"""
Sequence feature data into multi-component rows.
......@@ -42,7 +141,7 @@ def sequence_plain_df(df, num_before, num_after, inc_val=True):
return DF
def sequence_df(df, num_before, num_after, beg_val=None, end_val=None, inc_val=True, sequence_col=None, sequence_function=None):
def sequence_df(df, num_before, num_after, beg_val=None, end_val=None, inc_val=True, index_val=None, sequence_col=None, sequence_function=None):
"""
Sequence feature data into multi-component rows.
......@@ -83,49 +182,9 @@ def sequence_df(df, num_before, num_after, beg_val=None, end_val=None, inc_val=T
A pandas dataframe containing rows of prediction and/or label data.
"""
# Fetch sequence series
sequence_series = None
if sequence_col is None:
# Get the sequence series
sequence_series = df.index.to_series()
elif type(sequence_col) is pd.core.series.Series:
sequence_series = sequence_col
else:
# Get the sequence col
sequence_series = df[sequence_col]
# Compute Group ids
G_ids = contiguous_group_indices(df, index_val, sequence_col, sequence_function)
# If the sequence function is None, set it as the simple difference formula
if sequence_function is None:
sequence_function_ = lambda s: s-ref_val
else:
sequence_function_ = lambda s: sequence_function(s, ref_val)
# Compute differences against 'reference' value
ref_val = sequence_series.iloc[0]
try:
sequence_values = sequence_series.apply(sequence_function_)
except Exception as e:
print(f"Tried to subtract sequence values but ran into an error!")
raise e
# Check that sequence is an integer type
if not pd.api.types.is_integer_dtype(sequence_values.dtype):
raise TypeError(f"Sequence value type: {sequence_values.dtype} is not an integer type!")
# Restrict the DF if necessary
restricted_df = df
if beg_val is not None:
beg_val_diff = sequence_function_(beg_val)
restricted_df = restricted_df[sequence_values >= beg_val_diff]
if end_val is not None:
end_val_diff = sequence_function_(end_val)
restricted_df = restricted_df[sequence_values <= end_val_diff]
# Detect sequential groups
S = (sequence_values-sequence_values.shift(1)).fillna(0.0).astype(int)
# Group ids
G_ids = (S != 1).cumsum()
dfs = []
max_id = G_ids.max()
num_skipped = 0
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment