# Source code for logflow.relationsdiscover.Dataset

# Copyright 2020 BULL SAS All rights reserved #
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
import word2vec # type: ignore
import h5py # type: ignore
from collections import Counter
from loguru import logger
import os
import time
from multiprocessing import Pool
from tqdm import tqdm # type: ignore
from logflow.relationsdiscover.Cardinality import Cardinality
import pickle 
from typing import List

class Dataset:
    """Load the files and create the cardinalities.

    The constructor only checks its arguments and that the two input files
    exist; nothing is read from disk until ``loading_files`` (or ``run``) is
    called.

    Args:
        path_model (str, optional): path to the word2vec model. The file
            actually opened is ``path_model + name_dataset + "_model.lf"``
            (plain string concatenation, so include a trailing separator).
            Defaults to "".
        path_data (str, optional): path to the data (list of patterns). The
            file actually opened is
            ``path_data + name_dataset + "_embedding.lf"``. Defaults to "".
        name_dataset (str, optional): name of the dataset to load.
            Defaults to "".
        size (int, optional): number of examples to load; -1 loads all of
            them. Defaults to -1.
        one_model (bool, optional): use one global model instead of one model
            per cardinality. Defaults to False.

    Raises:
        AssertionError: path_model, path_data or name_dataset is empty.
        Exception: model file is not found.
        Exception: data file is not found.
    """

    def __init__(self, path_model="", path_data="", name_dataset="", size=-1, one_model=False):
        # Explicit raise instead of a bare ``assert``: assert statements are
        # removed when Python runs with -O, which would let an empty value
        # through to the file checks below. The exception type is kept so
        # existing callers see the same error.
        required = (
            ("path_model", path_model),
            ("path_data", path_data),
            ("name_dataset", name_dataset),
        )
        for arg_name, arg_value in required:
            if arg_value == "":
                raise AssertionError(arg_name + " must not be empty")

        self.path_model = path_model
        self.path_data = path_data
        self.name_dataset = name_dataset
        self.path_model_w2v = self.path_model + self.name_dataset + "_model.lf"
        self.path_list_classes = self.path_data + self.name_dataset + "_embedding.lf"
        self.size = size
        self.one_model = one_model

        # Fail early, before any loading is attempted.
        for path in (self.path_model_w2v, self.path_list_classes):
            if not os.path.isfile(path):
                raise Exception(path + " is not a file")

        # Filled by creating_cardinalities(); each call appends to it.
        self.list_cardinalities = []

    def loading_files(self):
        """Load the data, the word2vec and the counter file.

        Sets ``self.w2v`` and ``self.counter`` from the pickled model file
        (keys ``"word2vec"`` and ``"counter_patterns"``) and
        ``self.list_classes`` from the ``'list_classes'`` dataset of the HDF5
        data file.
        """
        logger.info("Loading word2vec model: " + self.path_model_w2v)
        # NOTE: unpickling can execute arbitrary code; the model file must
        # come from a trusted source.
        with open(self.path_model_w2v, "rb") as file_model:
            dict_local = pickle.load(file_model)
        self.w2v = dict_local["word2vec"]
        self.counter = dict_local["counter_patterns"]

        logger.info("Loading list of classes: " + self.path_list_classes)
        with h5py.File(self.path_list_classes, 'r') as file_h5py:
            dataset = file_h5py['list_classes']
            if self.size != -1:
                # Keep only the first ``size`` entries.
                self.list_classes = dataset[:self.size]
            else:
                # ``[()]`` reads the whole dataset.
                self.list_classes = dataset[()]

    def creating_cardinalities(self, min_cardinality=0, max_cardinality=float("+inf")):
        """Create the cardinality object for the learning step.

        The cardinality of an event is ``len(str(self.counter[event]))``, i.e.
        the length of the string form of its counter value. Requires
        ``loading_files`` to have been called first, since it reads
        ``self.counter``.

        Args:
            min_cardinality (int, optional): minimum value of cardinality to
                be selected (exclusive bound). Defaults to 0.
            max_cardinality (float, optional): maximum value of cardinality
                to be selected (exclusive bound). Defaults to float("+inf").

        NOTE(review): in one-model mode both arguments are ignored and the
        fixed bounds 3 and 8 are used instead. This matches the historical
        behaviour of this method - confirm whether it is intended before
        changing it.
        """
        cardinalities = (len(str(self.counter[event])) for event in self.counter)

        if self.one_model:
            # Fixed bounds of the one-model mode, set once instead of being
            # reassigned over the arguments on every loop iteration.
            one_model_min = 3
            one_model_max = 8
            self.set_cardinalities_available = {
                cardinality
                for cardinality in cardinalities
                if one_model_min < cardinality < one_model_max
            }
            logger.info(str(len(self.set_cardinalities_available)) + " cardinalities available in this dataset")
            # A single Cardinality (cardinality=0) receives the whole set of
            # selected cardinalities.
            self.list_cardinalities.append(
                Cardinality(
                    cardinality=0,
                    path_w2v=self.path_model_w2v,
                    path_list_classes=self.path_list_classes,
                    size=self.size,
                    one_model=self.one_model,
                    set_cardinalities=self.set_cardinalities_available,
                )
            )
        else:
            self.set_cardinalities_available = set(cardinalities)
            logger.info(str(len(self.set_cardinalities_available)) + " cardinalities available in this dataset")
            # One Cardinality per distinct value strictly inside the bounds.
            for cardinality in self.set_cardinalities_available:
                if min_cardinality < cardinality < max_cardinality:
                    self.list_cardinalities.append(
                        Cardinality(
                            cardinality=cardinality,
                            path_w2v=self.path_model_w2v,
                            path_list_classes=self.path_list_classes,
                            size=self.size,
                        )
                    )

    def run(self) -> "List[Cardinality]":
        """Start the workflow for the multithreading implementation.

        Loads the files, then creates the cardinalities with the default
        bounds.

        Returns:
            List[Cardinality]: list of the cardinalities created
        """
        self.loading_files()
        self.creating_cardinalities()
        return self.list_cardinalities