Source code for slitflow.setreqs

"""
This module includes functions used in the set_reqs method of the Data class.
The required data must be sorted to align the correspondence between the data.
"""

import numpy as np
import pandas as pd

from .fun.misc import reduce_list as rl


[docs]def fit_1to0(reqs):
    """Keep reqs[0] data even that doesn't contain in reqs[1].

    This function can be used to render movies with trajectories that some
    frames doesn't contain any trajectories.

    Args:
        reqs (list): List of required data with any data type.

    Returns:
        list: List of selected required data
    """
    index_0 = reqs[0].info.file_index().copy()
    index_1 = reqs[1].info.file_index().copy()
    cols = [col for col in index_0.columns if col in index_1.columns]
    cols = [col for col in cols if col not in ["_file", "_split"]]
    index_0 = index_0[cols].drop_duplicates().reset_index(drop=True)
    index_1 = index_1[cols].drop_duplicates().reset_index(drop=True)

    index_mrg = pd.concat([index_0, index_1], axis=0)
    to_put = index_mrg.duplicated(keep=False).iloc[:len(index_0)]
    pos_put = to_put.reset_index(drop=True).index[to_put].to_list()
    sort_data = [None for _ in range(len(index_0))]
    for pos, data in zip(pos_put, reqs[1].data):
        sort_data[pos] = data
    reqs[1].data = sort_data
    return reqs


[docs]def copy_1to0(reqs):
    """Sort reqs[1] according to the reqs[0] data structure.

    This function can be used if reqs[0] is split into multiple files while
    reqs[1] is not. reqs[1] is selected to fit reqs[0] data.

    Args:
        reqs (list): List of required data with any data type.

    Returns:
        list: List of selected required data
    """
    index_0 = reqs[0].info.file_index().copy()
    index_1 = reqs[1].info.file_index().copy()
    index_0 = index_0.groupby("_split").head(1)
    index_1 = index_1.rename(columns={'_split': '_split_1'})
    index_mrg = index_0.merge(index_1)
    pos_list = index_mrg["_split_1"].values
    sort_data = [None for _ in range(len(index_0))]
    for i, pos in enumerate(pos_list):
        sort_data[i] = reqs[1].data[pos]
    reqs[1].data = sort_data
    return reqs


[docs]def and_2reqs(reqs):
    """Drop elements that exist only in one required data.

    Args:
        reqs (list): List of required data with any data type.

    Returns:
        list: List of selected required data
    """
    index_0 = reqs[0].info.file_index().copy()
    index_1 = reqs[1].info.file_index().copy()
    cols = [col for col in index_0.columns if col in index_1.columns]
    cols = [col for col in cols if col not in ["_file", "_split"]]
    index_0 = index_0[cols].drop_duplicates().reset_index(drop=True)
    index_1 = index_1[cols].drop_duplicates().reset_index(drop=True)

    index_mrg = pd.concat([index_0, index_1], axis=0)
    to_pick_0 = index_mrg.duplicated(keep=False).iloc[:len(index_0)]
    reqs_0 = []
    for i in range(len(to_pick_0)):
        if to_pick_0[i]:
            reqs_0.append(reqs[0].data[i])
    to_pick_1 = index_mrg.duplicated(keep=False).iloc[len(index_0):]
    reqs_1 = []
    for i in range(len(to_pick_1)):
        if to_pick_1[i]:
            reqs_1.append(reqs[1].data[i])
    reqs[0].data = reqs_0
    reqs[1].data = reqs_1

    return reqs


[docs]def set_cols(index):
    """Return column names without _file and _split columns from index table.

    Args:
        index (pandas.DataFrame): Index table.

    Returns:
        list of str: List of column names
    """
    cols = [col for col in list(index.columns) if col not in [
        "_file", "_split"]]
    return cols


[docs]def set_reqs_file_nos(reqs, split_depth):
    """Get file numbers of required split data and save data.

    Args:
        reqs (list of Data): List of split required data.
        split_depth (int): Split depth of result data.

    Returns:
        tuple of list of int: (reqs_file_nos, save_file_nos)
    """
    if len(reqs) == 0:
        return [], []
    indexes = []

    # get column list
    cols_list = []
    for req in reqs:
        index = req.info.index
        cols = list(index.columns)
        indexes.append(index)
        cols_list.append(
            [col for col in list(index.columns) if col not in [
                "_file", "_split"]])

    # get common columns
    col_common = cols_list[0]
    for cols in cols_list:
        col_common = [col for col in cols if col in col_common]
    col_common = col_common + ["_file"]

    # set common index table
    indexes_common = []
    for i, index in enumerate(indexes):
        index = index[col_common].drop_duplicates().reset_index(drop=True)
        index = index.rename(columns={'_file': '_file_' + str(i)})
        indexes_common.append(index)
    index_mrg = indexes_common[0]
    for index in indexes_common:
        index_mrg = pd.merge(index_mrg, index, how='outer')

    # set save file no
    if split_depth > 0:
        grouped = index_mrg.groupby(rl(col_common[:split_depth]))
        dfs = list(list(zip(*grouped))[1])
        for i, _ in enumerate(dfs):
            dfs[i]["_file"] = i
        index_mrg = pd.concat(dfs)
    else:
        index_mrg["_file"] = 0
    index_mrg = index_mrg.drop(col_common[:-1], axis=1).dropna()\
        .drop_duplicates().reset_index(drop=True)

    # get reqs_no as list of integers
    reqs_no = index_mrg.values[:, :-1].astype(np.float64)
    # TODO: (Future) This code reduces repeat load but does not work without
    #   data stashing during set_reqs.
    # for row in range(1, reqs_no.shape[0]):
    #     for col in range(reqs_no.shape[1]):
    #         if reqs_no[row, col] == reqs_no[row - 1, col]:
    #             reqs_no[row, col] = None
    #         elif np.isnan(reqs_no[row - 1, col]):
    #             reqs_no[row, col] = None
    save_no = index_mrg.values[:, -1].astype(np.float64)
    for row in range(len(save_no) - 1):
        if save_no[row] == save_no[row + 1]:
            save_no[row] = None
    return reqs_no, save_no