Commit 796d28ff by Matthew Krafczyk

### re-organize, update code, add tests

parent cb99efe2
 import pandas as pd def contiguous_group_indices(df, index_level=None, sequence_col=None, sequence_function=None): def contiguous_group_indices(df, sequence_index_level=None, sequence_col=None, sequence_function=None): """ Produce series of continuous group labels for a given sequence column and sequencing function ... ... @@ -26,7 +26,7 @@ def contiguous_group_indices(df, index_level=None, sequence_col=None, sequence_f 2018Q4 26 2019Q2 20 Passing index_level='Quarter' and a sequence function like yq_diff: Passing sequence_index_level='Quarter' and a sequence function like yq_diff: def yr(quarter): return int(quarter[:4]) def mon(quarter): ... ... @@ -34,7 +34,7 @@ def contiguous_group_indices(df, index_level=None, sequence_col=None, sequence_f def yq_diff(yq, yq_ref): return ((yr(yq)*4+mon(yq))-((yr(yq_ref)*4)+mon(yq_ref))) grp_ids = contiguous_group_indices(df, index_level='Quarter', sequence_function=yq_diff) grp_ids = contiguous_group_indices(df, sequence_index_level='Quarter', sequence_function=yq_diff) Yields grp_ids as: ... ... @@ -63,11 +63,11 @@ def contiguous_group_indices(df, index_level=None, sequence_col=None, sequence_f # Use the sequence column if it's a Series sequence_series = sequence_col else: sequence_series = df[sequence_col] elif index_level is not None: if index_level not in df.index.names: raise ValueError(f"index_level {index_level} not in the data frame. available levels: {df.index.names}") level_idx = df.index.names.index(index_level) sequence_series = df.loc[:,sequence_col] elif sequence_index_level is not None: if sequence_index_level not in df.index.names: raise ValueError(f"sequence_index_level {sequence_index_level} not in the data frame. available levels: {df.index.names}") level_idx = df.index.names.index(sequence_index_level) sequence_series = df.index.to_series().apply(lambda t: t[level_idx]) else: # Get the dataframe index ... ... 
@@ -142,7 +142,10 @@ def sequence_plain_df(df, num_before, num_after, inc_val=True): return DF def sequence_df(df, num_before, num_after, beg_val=None, end_val=None, inc_val=True, index_level=None, sequence_col=None, sequence_function=None): def sequence_df(df, lags, group_index_level=None, group_col=None, sequence_index_level=None, sequence_col=None, sequence_function=None): """ Sequence feature data into multi-component rows. ... ... @@ -168,39 +171,106 @@ def sequence_df(df, num_before, num_after, beg_val=None, end_val=None, inc_val=T Named Arguments df: A Pandas dataframe containing a set of features for each day beg_val: The first sequence value for which sequences are needed end_val: The last sequence value for which sequences are needed lags: A list of lags to include num_before: The number of before the first predicted day needed for a prediction. num_after: The number of days after the first predicted day sequence_col: The column to use, if None will use the index. sequence_function: A function to use to compute sequence differences. If None, it'll just take the difference. This function should have behavior like this: Keyword Arguments inc_date: Whether to include data from the first predicted day returns A pandas dataframe containing rows of prediction and/or label data. """ # Compute Group ids G_ids = contiguous_group_indices(df, index_level=index_level, sequence_col=sequence_col, sequence_function=sequence_function) print("sequence_df start") dfs = [] max_id = G_ids.max() num_skipped = 0 for g_id in range(1,max_id+1): # For each group, first, check how many rows there are. num_in_group = (G_ids == g_id).sum() if num_in_group >= num_before+num_after+(1 if inc_val else 0): # This group has enough data. 
dfs.append(sequence_plain_df(df[G_ids == g_id], num_before, num_after, inc_val)) if group_index_level is None and group_col is None: # Compute Group ids G_ids = contiguous_group_indices(df, sequence_index_level=sequence_index_level, sequence_col=sequence_col, sequence_function=sequence_function) else: if type(group_col) is pd.core.series.Series: G_ids = group_col elif group_index_level is not None: if group_index_level in df.index.names: g_idx = df.index.names.index(group_index_level) elif type(group_index_level) is int: g_idx = group_index_level else: raise ValueError(f"group_index_level of type {type(group_index_level)} not supported") G_ids = df.index.to_series().apply(lambda i: i[g_idx]) else: num_skipped += 1 print(f"Skipped {num_skipped} groups when building dataframe") if len(dfs) == 0: raise RuntimeError("There was no data! to combine!") print(f"Combined {len(dfs)} groups") return pd.concat(dfs, axis=0) G_ids = df[group_col] print("Contiguous Ids") print(G_ids) if sequence_col is not None and type(sequence_col) is not pd.core.series.Series: # If sequence_col specifies a column of the dataFrame, we remove this column from the sequencing. temp_df = df.loc[:,list(filter(lambda c: c != sequence_col, df.columns))] elif group_col is not None and type(group_col) is not pd.core.series.Series: # If group_col specifies a column of the dataFrame, we remove this column from the sequencing. 
temp_df = df.loc[:,list(filter(lambda c: c != group_col, df.columns))] else: temp_df = df print("temp_df:") print(temp_df) print(type(temp_df)) print(temp_df.dtypes) # Change column types to support nans integer_columns = temp_df.dtypes[temp_df.dtypes.apply(pd.api.types.is_integer_dtype)] for col_name in integer_columns.index: print(f"1: col_name: {col_name}") temp_df.loc[:,col_name] = temp_df.loc[:,col_name].astype(pd.Int64Dtype()) print("temp_df after integer column change") print(temp_df) print(temp_df.dtypes) print("group ids") print(G_ids) # Groupby temp_gbydf = temp_df.groupby(G_ids) dfs = [] print("After groupby") print(type(temp_gbydf)) print(temp_gbydf.shift(0)) print(temp_gbydf.shift(1)) print(temp_gbydf.shift(2)) print("before for loop") for lag in lags: print(f"loop lag: {lag}") slice_df = temp_gbydf.shift(-lag) print(slice_df) print(type(slice_df)) print("loop 2") if slice_df is not None: print("loop 3") slice_df.columns = pd.MultiIndex.from_product([slice_df.columns,[lag]]) print("loop 4") dfs.append(slice_df) print("loop 5") # Join segments into full dataframe. DF = pd.concat(dfs, axis=1, join='outer').dropna() # Restore the original types of the integer columns print("DF") print(DF) print("columns") for col in DF.columns: print(col) if col[0] in integer_columns.index: DF[col] = DF[col].astype(integer_columns.loc[col[0]]) print("sequence_df end") return DF
 # Tests for pandas_sequence.sequence_df; every case requests the lags [-1, 0].
import pandas as pd
import pandas_sequence as pds


# Per-column input values shared by several cases (tuples, so never mutated).
_RESTARTING_VALUES = (1, 2, 3, 4, 1, 2, 3, 1)
_RUNNING_VALUES = (1, 2, 3, 4, 5, 6, 7, 8)
_INT_GROUPS = (1, 1, 1, 1, 2, 2, 2, 3)
_STR_GROUPS = ('A', 'A', 'A', 'A', 'B', 'B', 'B', 'C')
_SEQ_LETTERS = ('A', 'B', 'C', 'D', 'A', 'B', 'C', 'A')

# Expected (lag -1, lag 0) value pairs for the two 'Value' patterns above.
_RESTARTING_PAIRS = ((1, 2), (2, 3), (3, 4), (1, 2), (2, 3))
_RUNNING_PAIRS = ((1, 2), (2, 3), (3, 4), (5, 6), (6, 7))


def _source_frame(columns, *column_values, index=None):
    """Assemble an input frame row by row from parallel per-column values.

    When no index is supplied, the rows are labelled 0..n-1 via an explicit
    list of integers.
    """
    rows = [list(row) for row in zip(*column_values)]
    if index is None:
        index = list(range(len(rows)))
    return pd.DataFrame(rows, columns=columns, index=index)


def _expected_frame(pairs, index):
    """Build the expected result: 'Value' under the lag labels -1 and 0."""
    lag_columns = pd.MultiIndex.from_product([['Value'], [-1, 0]])
    return pd.DataFrame(
        [list(pair) for pair in pairs], columns=lag_columns, index=index
    )


def _group_count_index(*pairs):
    """Two-level index whose levels are named 'Group' and 'count'."""
    return pd.MultiIndex.from_tuples(list(pairs), names=['Group', 'count'])


def _letter_gap(s, ref_val):
    """Sequence function for one-character labels: code-point difference."""
    return ord(s) - ord(ref_val)


def test_basic_group_col_1_1():
    """Groups are named by integer labels in the 'Group' column."""
    frame = _source_frame(['Group', 'Value'], _INT_GROUPS, _RESTARTING_VALUES)
    result = pds.sequence_df(frame, [-1, 0], group_col='Group')
    assert result.equals(_expected_frame(_RESTARTING_PAIRS, [1, 2, 3, 5, 6]))


def test_basic_group_col_1_2():
    """Groups are named by string labels in the 'Group' column."""
    frame = _source_frame(['Group', 'Value'], _STR_GROUPS, _RESTARTING_VALUES)
    result = pds.sequence_df(frame, [-1, 0], group_col='Group')
    assert result.equals(_expected_frame(_RESTARTING_PAIRS, [1, 2, 3, 5, 6]))


def test_basic_group_col_1_3():
    """Group labels are handed over as a standalone Series."""
    frame = _source_frame(['Value'], _RESTARTING_VALUES)
    labels = pd.Series(list(_INT_GROUPS))
    result = pds.sequence_df(frame, [-1, 0], group_col=labels)
    assert result.equals(_expected_frame(_RESTARTING_PAIRS, [1, 2, 3, 5, 6]))


def test_basic_1():
    """No grouping arguments: the gaps in the integer index (4 and 8 are
    missing) are expected to separate the runs."""
    frame = _source_frame(
        ['Value'], _RESTARTING_VALUES, index=[0, 1, 2, 3, 5, 6, 7, 9]
    )
    result = pds.sequence_df(frame, [-1, 0])
    assert result.equals(_expected_frame(_RESTARTING_PAIRS, [1, 2, 3, 6, 7]))


def test_basic_group_index_1():
    """Groups are taken from the 'Group' level of a two-level index."""
    frame = _source_frame(
        ['Value'],
        _RESTARTING_VALUES,
        index=_group_count_index(
            ('A', 0), ('A', 1), ('A', 2), ('A', 3),
            ('B', 4), ('B', 5), ('B', 6),
            ('C', 7),
        ),
    )
    result = pds.sequence_df(frame, [-1, 0], group_index_level='Group')
    wanted_index = _group_count_index(
        ('A', 1), ('A', 2), ('A', 3), ('B', 5), ('B', 6)
    )
    assert result.equals(_expected_frame(_RESTARTING_PAIRS, wanted_index))


def test_basic_sequence_index_1():
    """The 'count' index level is the sequence; it restarts at 0 per letter."""
    frame = _source_frame(
        ['Value'],
        _RESTARTING_VALUES,
        index=_group_count_index(
            ('A', 0), ('A', 1), ('A', 2), ('A', 3),
            ('B', 0), ('B', 1), ('B', 2),
            ('C', 0),
        ),
    )
    result = pds.sequence_df(frame, [-1, 0], sequence_index_level='count')
    wanted_index = _group_count_index(
        ('A', 1), ('A', 2), ('A', 3), ('B', 1), ('B', 2)
    )
    assert result.equals(_expected_frame(_RESTARTING_PAIRS, wanted_index))


def test_basic_group_col_2_1():
    """Integer 'Group' column with values that keep counting across groups."""
    frame = _source_frame(['Group', 'Value'], _INT_GROUPS, _RUNNING_VALUES)
    result = pds.sequence_df(frame, [-1, 0], group_col='Group')
    assert result.equals(_expected_frame(_RUNNING_PAIRS, [1, 2, 3, 5, 6]))


def test_basic_group_col_2_2():
    """String 'Group' column with values that keep counting across groups."""
    frame = _source_frame(['Group', 'Value'], _STR_GROUPS, _RUNNING_VALUES)
    result = pds.sequence_df(frame, [-1, 0], group_col='Group')
    assert result.equals(_expected_frame(_RUNNING_PAIRS, [1, 2, 3, 5, 6]))


def test_basic_sequence_col_1_1():
    """An integer 'Seq' column supplies the sequence values."""
    frame = _source_frame(
        ['Seq', 'Value'], _RESTARTING_VALUES, _RESTARTING_VALUES
    )
    result = pds.sequence_df(frame, [-1, 0], sequence_col='Seq')
    assert result.equals(_expected_frame(_RESTARTING_PAIRS, [1, 2, 3, 5, 6]))


def test_basic_sequence_col_1_2():
    """A letter-valued 'Seq' column, differenced with a custom function."""
    frame = _source_frame(['Seq', 'Value'], _SEQ_LETTERS, _RESTARTING_VALUES)
    result = pds.sequence_df(
        frame, [-1, 0], sequence_col='Seq', sequence_function=_letter_gap
    )
    assert result.equals(_expected_frame(_RESTARTING_PAIRS, [1, 2, 3, 5, 6]))


def test_basic_sequence_col_1_3():
    """Letter sequence passed as a standalone Series with a custom function."""
    frame = _source_frame(['Value'], _RESTARTING_VALUES)
    letters = pd.Series(list(_SEQ_LETTERS))
    result = pds.sequence_df(
        frame, [-1, 0], sequence_col=letters, sequence_function=_letter_gap
    )
    assert result.equals(_expected_frame(_RESTARTING_PAIRS, [1, 2, 3, 5, 6]))


def test_large_group_col_1_1():
    """Same scenario as test_basic_group_col_1_1.

    NOTE(review): despite the 'large' in the name, the input is the same
    eight-row frame used by test_basic_group_col_1_1 - confirm whether a
    bigger fixture was intended.
    """
    frame = _source_frame(['Group', 'Value'], _INT_GROUPS, _RESTARTING_VALUES)
    result = pds.sequence_df(frame, [-1, 0], group_col='Group')
    assert result.equals(_expected_frame(_RESTARTING_PAIRS, [1, 2, 3, 5, 6]))
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!