Commit f0f2e862 by Matthew Krafczyk

### Separate out function to produce contiguous group identification

parent 8c155bed
 import pandas as pd def contiguous_group_indices(df, index_level=None, sequence_col=None, sequence_function=None): """ Produce series of continuous group labels for a given sequence column and sequencing function Suppose a column or index level contains a 'sequencable value'. This may be any value with a countable number of discrete elements which may be ordered 'by 1'. This function produces a series (with index matching the df) containing integers indicating contiguous groups of values. This is useful for proper column shift logic. Ex: Consider the following DataFrame of Security prices. It has a multi-level index whose second level is a sequencable value 'Quarter'. Security Quarter Price Sec-1 2019Q1 10. 2019Q2 11. 2019Q4 10.5 2020Q1 10.6 2020Q2 10.7 Sec-2 2018Q1 25 2018Q2 24 2018Q3 25 2018Q4 26 2019Q2 20 Passing index_level='Quarter' and a sequence function like yq_diff: def yr(quarter): return int(quarter[:4]) def mon(quarter): return int(quarter[5:]) def yq_diff(yq, yq_ref): return ((yr(yq)*4+mon(yq))-((yr(yq_ref)*4)+mon(yq_ref))) grp_ids = contiguous_group_indices(df, index_level='Quarter', sequence_function=yq_diff) Yields grp_ids as: Security Quarter Sec-1 2019Q1 1 2019Q2 1 2019Q4 2 2020Q1 2 2020Q2 2 Sec-2 2018Q1 3 2018Q2 3 2018Q3 3 2018Q4 3 2019Q2 4 The series indicates groups of contiguous values. We can find differences in price to previous quarters properly respecting gaps when they pop up. df['Price-diff'] = df['Price']-df['Price'].groupby(grp_ids).shift(1) """ # Fetch sequence series sequence_series = None if sequence_col is None: # Get the sequence series sequence_series = df.index.to_series() elif type(sequence_col) is pd.core.series.Series: sequence_series = sequence_col elif index_level is not None: if index_level not in df.index.names: raise ValueError(f"index_level {index_level} not in the data frame. 
available levels: {df.index.names}") level_idx = df.index.names.index(index_level) sequence_series = df.index.to_series().apply(lambda t: t[level_idx]) else: # Get the sequence col sequence_series = df[sequence_col] # If the sequence function is None, set it as the simple difference formula if sequence_function is None: sequence_function_ = lambda s: s-ref_val else: sequence_function_ = lambda s: sequence_function(s, ref_val) # Compute differences against 'reference' value ref_val = sequence_series.iloc[0] try: sequence_values = sequence_series.apply(sequence_function_) except Exception as e: print(f"Tried to subtract sequence values but ran into an error!") raise e # Check that sequence is an integer type if not pd.api.types.is_integer_dtype(sequence_values.dtype): raise TypeError(f"Sequence value type: {sequence_values.dtype} is not an integer type!") # Detect sequential groups S = (sequence_values-sequence_values.shift(1)).fillna(0.0).astype(int) # Group ids # This procedure may fail if the selected index level isn't the 'lowest'. grp_ids = (S != 1).cumsum() return grp_ids def sequence_plain_df(df, num_before, num_after, inc_val=True): """ Sequence feature data into multi-component rows. ... ... @@ -42,7 +141,7 @@ def sequence_plain_df(df, num_before, num_after, inc_val=True): return DF def sequence_df(df, num_before, num_after, beg_val=None, end_val=None, inc_val=True, sequence_col=None, sequence_function=None): def sequence_df(df, num_before, num_after, beg_val=None, end_val=None, inc_val=True, index_val=None, sequence_col=None, sequence_function=None): """ Sequence feature data into multi-component rows. ... ... @@ -83,49 +182,9 @@ def sequence_df(df, num_before, num_after, beg_val=None, end_val=None, inc_val=T A pandas dataframe containing rows of prediction and/or label data. 
""" # Fetch sequence series sequence_series = None if sequence_col is None: # Get the sequence series sequence_series = df.index.to_series() elif type(sequence_col) is pd.core.series.Series: sequence_series = sequence_col else: # Get the sequence col sequence_series = df[sequence_col] # Compute Group ids G_ids = contiguous_group_indices(df, index_val, sequence_col, sequence_function) # If the sequence function is None, set it as the simple difference formula if sequence_function is None: sequence_function_ = lambda s: s-ref_val else: sequence_function_ = lambda s: sequence_function(s, ref_val) # Compute differences against 'reference' value ref_val = sequence_series.iloc[0] try: sequence_values = sequence_series.apply(sequence_function_) except Exception as e: print(f"Tried to subtract sequence values but ran into an error!") raise e # Check that sequence is an integer type if not pd.api.types.is_integer_dtype(sequence_values.dtype): raise TypeError(f"Sequence value type: {sequence_values.dtype} is not an integer type!") # Restrict the DF if necessary restricted_df = df if beg_val is not None: beg_val_diff = sequence_function_(beg_val) restricted_df = restricted_df[sequence_values >= beg_val_diff] if end_val is not None: end_val_diff = sequence_function_(end_val) restricted_df = restricted_df[sequence_values <= end_val_diff] # Detect sequential groups S = (sequence_values-sequence_values.shift(1)).fillna(0.0).astype(int) # Group ids G_ids = (S != 1).cumsum() dfs = [] max_id = G_ids.max() num_skipped = 0 ... ...
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment