# Source code for hal.ml.correlation
# -*- coding: utf-8 -*-
"""Correlate values in arrays producing fancy good-looking matrices"""
import os
import time
import numpy as np
from matplotlib import pyplot
from hal.charts import correlation as cr_plot
from hal.data.matrix import Matrix
from hal.files.models.files import Document
from hal.files.models.system import list_content, is_file
from hal.files.parsers import CSVParser
class CorrelationMatrix:
    """Common operations for a correlation matrix.

    An instance stores a table of values (``data``), the names of all of its
    columns (``headers``) and the subset of column names to correlate
    (``headers_to_test``), together with the title used when plotting.
    """

    def __init__(self, title, headers_to_test, headers, data):
        """
        :param title: Title to show
        :param headers_to_test: List of columns to get correlation matrix of
        :param headers: List of all headers in matrix
        :param data: Matrix of float values
        """
        self.title = title
        self.headers_to_test = headers_to_test
        self.headers = headers
        self.data = data

    @staticmethod
    def get_correlation_matrix(matrix):
        """Finds correlation matrix of matrix

        The input is handed to ``numpy.corrcoef`` unchanged, so each entry of
        ``matrix`` is treated as one variable and its items as observations.

        :param matrix: List of features to get correlation matrix
        :return: correlation matrix
        """
        return np.corrcoef(matrix)

    def show_correlation_matrix(self, correlation_matrix):
        """Shows the given correlation matrix as image

        :param correlation_matrix: Correlation matrix of features
        """
        cr_plot.create_correlation_matrix_plot(
            correlation_matrix, self.title, self.headers_to_test
        )
        pyplot.show()

    def show_correlation_matrix_from_columns(self):
        """Shows the correlation matrix of columns"""
        correlation_matrix = self.get_correlation_matrix_from_columns()
        self.show_correlation_matrix(correlation_matrix)

    def get_correlation_matrix_from_columns(self):
        """Computes correlation matrix of columns

        Every header in ``headers_to_test`` is resolved to its column of
        ``data``, the column values are converted to ``float`` and the
        resulting list of columns is correlated.

        :return: Correlation matrix of columns
        :raises KeyError: if a header to test is not in ``headers``
        :raises ValueError: if a value cannot be converted to ``float``
        """
        # Index headers in a single pass. ``setdefault`` keeps the first
        # position of a repeated header, which is what ``list.index`` gave.
        header_to_column = {}
        for position, header in enumerate(self.headers):
            header_to_column.setdefault(header, position)

        # Wrap the data once rather than once per tested header.
        # NOTE(review): assumes constructing Matrix has no side effects and
        # that get_column returns an iterable of the column's values.
        table = Matrix(self.data)

        data_to_test = []
        for header in self.headers_to_test:
            column = table.get_column(header_to_column[header])
            # Build a fresh list of floats instead of writing back into the
            # object returned by get_column.
            data_to_test.append([float(value) for value in column])

        return self.get_correlation_matrix(data_to_test)

    def save_to_file(self, out_file):
        """Saves correlation matrix of selected headers

        The plot is drawn, the current figure is resized to 23.4 x 23.4
        inches and written at 120 dpi. The figure is closed afterwards so
        repeated calls (e.g. one per file in a folder) do not pile up open
        figures.

        :param out_file: Output file
        """
        correlation_matrix = self.get_correlation_matrix_from_columns()
        cr_plot.create_correlation_matrix_plot(
            correlation_matrix, self.title, self.headers_to_test
        )

        # Presumably the plot helper draws on pyplot's current figure, as the
        # original code also relied on -- verify against hal.charts.
        fig = pyplot.gcf()  # get reference to figure
        try:
            fig.set_size_inches(23.4, 23.4)
            pyplot.savefig(out_file, dpi=120)
        finally:
            pyplot.close(fig)  # release the figure even if saving fails

    @staticmethod
    def save_correlation_matrix_from_folder(folder_path):
        """Saves the correlation matrix of every CSV file in a folder

        A new ``output-<unix timestamp>`` folder is created inside
        ``folder_path``. For each listed entry that is a file and whose name
        ends with ``csv``, all of its headers are correlated and the plot is
        saved there as ``<document name>.png``.

        :param folder_path: Folder containing logs data
        """
        output_folder = os.path.join(
            folder_path, "output-" + str(int(time.time()))
        )
        os.makedirs(output_folder)  # make necessary folders to create directory

        for path in list_content(folder_path, False, False):
            if not (is_file(path) and str(path).endswith("csv")):
                continue  # only CSV files are analysed

            print("Analysing file ", str(path))

            document_name = Document(path).name.strip()
            output_file_path = os.path.join(
                output_folder, document_name + ".png"  # save output as image
            )

            headers, data = CSVParser.get_headers_data(path)  # parse
            matrix = CorrelationMatrix(
                "Correlation of logs data for file " + document_name,
                headers,  # test every column ...
                headers,  # ... out of all the columns
                data
            )
            matrix.save_to_file(output_file_path)