stable version: prepare to support training different models on different devices

This commit is contained in:
user 2024-11-07 14:37:51 +08:00
parent cfac5fd675
commit 6e333fe733
66 changed files with 2650 additions and 591 deletions

3
.gitignore vendored
View File

@ -1,5 +1,4 @@
dataset
rawdata
old_rawdata
labels
logs

View File

@ -1,5 +1,15 @@
# Change Log
## v0.1.0.20240910_alpha
## v0.0.2.20241015_alpha
# [⭐Features]
- Fetch data from the MySQL database
# [🔄Changed]
- Changed the data processing pipeline: first fetch data from the database and split it into training and test sets to produce intermediate files, then perform subsequent processing such as spatial-point selection
## v0.0.1.20240910_alpha
- End-to-end training pipeline runs

15
TODO.md
View File

@ -0,0 +1,15 @@
- [ ] Survey all tunable hyperparameters
- [ ] Survey loss designs for regression tasks
- [ ] Survey model architectures applicable to regression and their tunable parameters
- [ ] Survey methods for escaping local optima
- [x] Convert the data in the database into training, validation, and test sets
- [x] Run different preprocessing, models, and hyperparameters on the training/validation sets to obtain the best model
- [x] Save the best model and its corresponding hyperparameters, and evaluate it on the test set to obtain the test error
- [x] Use different models for different target parameters

View File

@ -5,16 +5,16 @@ defaults:
task_name: main
raw_spectral_data_dir: /code/admin/20240806-NanEr-5-8-data/rawdata
raw_labels_dir: /code/admin/20240806-NanEr-5-8-data/labels/NanEr
dataset_dir: /code/admin/20240806-NanEr-5-8-data/dataset
labels_name: ["TSC_T","TSC_C","TSC_C_lab","TSC_P_lab"]
# raw_spectral_data_dir: /data/SEMS-model-training/rawdata
# raw_labels_dir: /data/SEMS-model-training/labels/NanEr
dataset_dir: /data/SEMS-model-training/dataset
# labels_name: ["TSC_T","TSC_C","TSC_C_lab","TSC_P_lab"]
dataset:
train_ratio: 0.8
validate_ratio: 0.1
num_worker: 128
# train_ratio: 0.8
# validate_ratio: 0.1
num_worker: 0
train:
max_epoch: 10000
max_epoch: 5000
checkpoint_interval: 100
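The block above is a Hydra-style config; a minimal sketch of inspecting it directly with OmegaConf, assuming it is saved as conf/main.yaml (the path and script name are illustrative, not from this repo):

from omegaconf import OmegaConf

cfg = OmegaConf.load("conf/main.yaml")        # hypothetical location of the config above
print(cfg.dataset_dir)                        # /data/SEMS-model-training/dataset
print(cfg.dataset.num_worker, cfg.train.max_epoch)

# In a Hydra app the same fields can be overridden from the command line, e.g.:
#   python train.py train.max_epoch=10000 dataset.num_worker=8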

View File

@ -1,40 +1,37 @@
run:
num_samples: 1
resources_per_trial:
cpu: 128
gpu: 1
cpu: 4
gpu: 0.1
scheduler:
_target_: ray.tune.schedulers.ASHAScheduler
metric: val_loss
mode: min
max_t: 20000
grace_period: 1
reduction_factor: 2
_target_: ray.tune.schedulers.FIFOScheduler
# _target_: ray.tune.schedulers.ASHAScheduler
# metric: val_loss
# mode: min
# max_t: 20000
# grace_period: 1
# reduction_factor: 2
config:
choose_frame_spatial:
_target_: ray.tune.grid_search
values:
- _target_: data.choose_frame_spatial.mean.ChooseFrameSpatial
interval: [-3,0]
- _target_: data.choose_frame_spatial.DBSCAN.ChooseFrameSpatial ##DBSCAN_data_augmentation
features_scaling:
_target_: ray.tune.grid_search
values:
- _target_: data.features_scaling.max_min.FeatureScaling
- _target_: data.features_scaling.standardization.FeatureScaling #
labels_scaling:
_target_: ray.tune.grid_search
values:
- _target_: data.labels_scaling.max_min.LabelScaling
- _target_: data.labels_scaling.standardization.LabelScaling #standardization
model:
_target_: ray.tune.grid_search
values:
# - _target_: model.FCNN.FCNN
# - _target_: model.CNN_LSTM_FCNN.CNN_LSTM_FCNN
- _target_: model.CNN1D_FCNN.CNN1D_FCNN
- _target_: model.DRSN-CW.SpectralModel
# - _target_: model.FCNN_DBSCAN_small.SpectralModel
criterion:
_target_: ray.tune.grid_search
values:
@ -42,10 +39,15 @@ config:
optimizer:
_target_: ray.tune.grid_search
values:
- _target_: torch.optim.Adam
_partial_: true
lr: 0.000001
- _target_: torch.optim.Adam
_partial_: true
lr: 0.0001
batch_size:
_target_: ray.tune.grid_search
values:
- 1024
- 128
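Each grid_search entry above enumerates concrete values for one component. A minimal sketch (the function name is illustrative, not the project's actual trainable) of how one sampled trial config with _target_ entries could be turned into live objects inside a Ray Tune trainable via hydra.utils.instantiate:

from hydra.utils import instantiate

def train_fn(trial_cfg):
    # each field holds one concrete value chosen by ray.tune.grid_search
    choose_frame_spatial = instantiate(trial_cfg["choose_frame_spatial"])  # e.g. DBSCAN.ChooseFrameSpatial
    model = instantiate(trial_cfg["model"])                                # e.g. CNN1D_FCNN
    criterion = instantiate(trial_cfg["criterion"])
    opt_fn = instantiate(trial_cfg["optimizer"])      # _partial_: true -> returns a functools.partial
    optimizer = opt_fn(model.parameters())
    batch_size = trial_cfg["batch_size"]
    ...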

View File

@ -0,0 +1,142 @@
'''
@File : mean.py
@Time : 2024/08/12 13:53:36
@Author : Zhanpeng Yang
@Version : 0.0.1
@Contact : zhpyang@outlook.com
@Desc    : First-stage data preprocessing: select suitable spectra from the continuous spectral data as the input features
'''
import datetime
import numpy as np
from sklearn.cluster import DBSCAN
from pathlib import Path
class ChooseFrameSpatial:
def __init__(self,eps=0.15,min_samples=10):
"""注意,算法所需要的超参数均从此处传入,作为类成员,其他函数不允许传入超参数
"""
self.eps=eps
self.min_samples=min_samples
self.description=f"{str(self.__class__)[8:-2].split(".")[-2]}_eps_{eps}_min_samples_{min_samples}" #此项当没有新增超参数时不需要修改
self.file_path = Path(__file__).absolute()
def load_state_dict(self,state):
self.eps=state["eps"]
self.min_samples=state["min_samples"]
def state_dict(self):
return {"eps":self.eps,"min_samples":self.min_samples}
def get_specific_data(self,measureStartDatetime,measureEndDatetime,timestamps,rawdata):
"""
Fetch the spectral data within the specified time window
"""
if isinstance(measureStartDatetime.item(),str):
start_timestamp=datetime.datetime.strptime(measureStartDatetime.item(), "%Y-%m-%d %H:%M:%S").timestamp()*1000 -1000 # measurement start time - 1 s; all timestamps in this project are in ms
end_timestamp=datetime.datetime.strptime(measureStartDatetime.item(), "%Y-%m-%d %H:%M:%S").timestamp()*1000 + 1000 # measurement start time + 1 s
else:
start_timestamp=measureStartDatetime.item().timestamp()*1000 - 1000
end_timestamp=measureStartDatetime.item().timestamp()*1000 + 1000
start_index=0
end_index=timestamps.shape[0]
for i in range(1,timestamps.shape[0]):
if timestamps[i]>=start_timestamp and timestamps[i-1]<=start_timestamp:
# print(f"开始索引为{i}")
start_index=i
if timestamps[i]>=end_timestamp and timestamps[i-1]<=end_timestamp:
# print(f"结束索引为{i}")
end_index=i
return rawdata[start_index:end_index,...]
def run(self,measureStartDatetime,measureEndDatetime,timestamps,rawdata):
rawSpectralData=self.get_specific_data(measureStartDatetime,measureEndDatetime,timestamps,rawdata)
rawSpectralData=rawSpectralData.transpose(0, 2, 1)
rawSpectralData=rawSpectralData.reshape((rawSpectralData.shape[0]*rawSpectralData.shape[1],rawSpectralData.shape[2]))
rawSpectralData_normed=(rawSpectralData-np.min(rawSpectralData,axis=1,keepdims=True))/(np.max(rawSpectralData,axis=1,keepdims=True)-np.min(rawSpectralData,axis=1,keepdims=True))
db_norm = DBSCAN(eps=self.eps, min_samples=self.min_samples).fit(rawSpectralData_normed)
labels_norm = db_norm.labels_
n_norm = len(set(labels_norm)) - (1 if -1 in labels_norm else 0)
if n_norm==0:
return None
max_i=0
max_num=0
for i in range(n_norm):
tmp=(labels_norm==i).sum()
if tmp>max_num:
max_i=i
max_num=tmp
selected_data=rawSpectralData_normed[labels_norm==max_i,:]
selected_data=np.mean(selected_data,axis=0)
# space_intensity_mean = np.mean(np.sum(selected_data, axis=1), axis=0)
# space_num= 10 #空间上平均光强合适且方差最小的10个点
# light_indices = np.where( (space_intensity_mean > (0.25*np.max(space_intensity_mean)+0.75*np.min(space_intensity_mean)))
# & (space_intensity_mean < (0.8*np.max(space_intensity_mean)+0.2*np.min(space_intensity_mean)))
# )
# if light_indices[0].shape[0] < space_num:
# print('No Enough Usable Space Point Found!')
# return None
# intensity_data = selected_data[:,:,light_indices[0]]
# # spatial_variance = np.var(np.sum(intensity_data,axis=1), axis=0)
# # top_10_indices = np.argsort(spatial_variance)[:space_num]
# space_selected_data = intensity_data[:,:,:]
# space_selected_data = np.mean(np.mean(space_selected_data,axis=2),axis=0)
# space_selected_data = (space_selected_data - np.min(space_selected_data)) / (np.max(space_selected_data) - np.min(space_selected_data))
# # Savitzky-Golay filter
# data_filtered = savgol_filter(space_selected_data, window_length=31, polyorder=2, axis=0)
# spec_indices = [106, 127, 118, 0, 52, 23, 113, 77, 157, 175, 195, 24, 218, 108, 8, 211, 2, 136, 11,
# 36, 129, 109, 153, 188, 200, 196, 7, 19, 6, 198, 193, 221, 156, 187, 162, 204, 85, 104, 41,
# 16, 185, 125, 14, 149, 91, 138, 72, 146, 35, 53, 190, 148, 75, 18, 17, 96, 167, 192, 201, 31,
# 158, 183, 32, 40, 123, 145, 161, 27, 209, 216, 101, 51, 147, 58, 182, 49, 119, 13, 179, 140,
# 105, 45, 55, 33, 73, 111, 97, 194, 121, 89, 38, 12, 197, 173, 160, 131, 141, 37, 208, 47]
# selected_data = data_filtered[spec_indices]
return selected_data
if __name__ =="__main__":
raw_data=np.load("/data/SEMS-model-training/dataset/raw_dataset/test/24609591.npz",allow_pickle=True)
print(raw_data.keys())
print(f"炉次号\t{raw_data["furnaceNumber"]}")
print(f"开始测量时间\t{raw_data["measureStartDatetime"]}")
print(f"结束测量时间\t{raw_data["measureEndDatetime"]}")
print(f"时间戳维度\t{raw_data["timestamps"].shape}")
print(f"原始光谱数据维度\t{raw_data["rawSpectralData"].shape}")
tmp=ChooseFrameSpatial()
output=tmp.run(raw_data["measureStartDatetime"],raw_data["measureEndDatetime"],raw_data["timestamps"],raw_data["rawSpectralData"])
print(f"输入数据维度:{raw_data["rawSpectralData"].shape}\n输出数据维度:{output.shape}")

View File

@ -0,0 +1,102 @@
'''
@File : mean.py
@Time : 2024/08/12 13:53:36
@Author : Zhanpeng Yang
@Version : 0.0.1
@Contact : zhpyang@outlook.com
@Desc    : First-stage data preprocessing: select suitable spectra from the continuous spectral data as the input features
'''
import datetime
import numpy as np
from sklearn.cross_decomposition import PLSRegression
from scipy.signal import savgol_filter
from sklearn.cluster import DBSCAN
class ChooseFrameSpatial:
def __init__(self,eps=0.15,min_samples=10):
"""注意,算法所需要的超参数均从此处传入,作为类成员,其他函数不允许传入超参数
"""
self.eps=eps
self.min_samples=min_samples
self.description=f"{str(self.__class__)[8:-2].split(".")[-2]}_eps_{eps}_min_samples_{min_samples}" #此项当没有新增超参数时不需要修改
def get_specific_data(self,measureStartDatetime,measureEndDatetime,timestamps,rawdata):
"""
Fetch the spectral data within the specified time window
"""
if isinstance(measureStartDatetime.item(),str):
start_timestamp=datetime.datetime.strptime(measureStartDatetime.item(), "%Y-%m-%d %H:%M:%S").timestamp()*1000 -1000 # measurement start time - 1 s; all timestamps in this project are in ms
end_timestamp=datetime.datetime.strptime(measureStartDatetime.item(), "%Y-%m-%d %H:%M:%S").timestamp()*1000 + 1000 # measurement start time + 1 s
else:
start_timestamp=measureStartDatetime.item().timestamp()*1000 - 1000
end_timestamp=measureStartDatetime.item().timestamp()*1000 + 1000
start_index=0
end_index=timestamps.shape[0]
for i in range(1,timestamps.shape[0]):
if timestamps[i]>=start_timestamp and timestamps[i-1]<=start_timestamp:
# print(f"开始索引为{i}")
start_index=i
if timestamps[i]>=end_timestamp and timestamps[i-1]<=end_timestamp:
# print(f"结束索引为{i}")
end_index=i
return rawdata[start_index:end_index,...]
def run(self,measureStartDatetime,measureEndDatetime,timestamps,rawdata):
rawSpectralData=self.get_specific_data(measureStartDatetime,measureEndDatetime,timestamps,rawdata)
rawSpectralData=rawSpectralData.transpose(0, 2, 1)
rawSpectralData=rawSpectralData.reshape((rawSpectralData.shape[0]*rawSpectralData.shape[1],rawSpectralData.shape[2]))
rawSpectralData_normed=(rawSpectralData-np.min(rawSpectralData,axis=1,keepdims=True))/(np.max(rawSpectralData,axis=1,keepdims=True)-np.min(rawSpectralData,axis=1,keepdims=True))
db_norm = DBSCAN(eps=self.eps, min_samples=self.min_samples).fit(rawSpectralData_normed)
labels_norm = db_norm.labels_
n_norm = len(set(labels_norm)) - (1 if -1 in labels_norm else 0)
if n_norm==0:
return None
max_i=0
max_num=0
for i in range(n_norm):
tmp=(labels_norm==i).sum()
if tmp>max_num:
max_i=i
max_num=tmp
selected_data=rawSpectralData_normed[labels_norm==max_i,:]
# selected_data=np.mean(selected_data,axis=0)
return selected_data
if __name__ =="__main__":
raw_data=np.load("/data/SEMS-model-training/dataset/raw_dataset/test/24609591.npz",allow_pickle=True)
print(raw_data.keys())
print(f"炉次号\t{raw_data["furnaceNumber"]}")
print(f"开始测量时间\t{raw_data["measureStartDatetime"]}")
print(f"结束测量时间\t{raw_data["measureEndDatetime"]}")
print(f"时间戳维度\t{raw_data["timestamps"].shape}")
print(f"原始光谱数据维度\t{raw_data["rawSpectralData"].shape}")
tmp=ChooseFrameSpatial()
output=tmp.run(raw_data["measureStartDatetime"],raw_data["measureEndDatetime"],raw_data["timestamps"],raw_data["rawSpectralData"])
print(f"输入数据维度:{raw_data["rawSpectralData"].shape}\n输出数据维度:{output.shape}")

View File

@ -1,100 +0,0 @@
'''
@File : mean.py
@Time : 2024/08/12 13:53:36
@Author : Zhanpeng Yang
@Version : 0.0.1
@Contact : zhpyang@outlook.com
@Desc    : First-stage data preprocessing: select suitable spectra from the continuous spectral data as the input features
'''
import datetime
import numpy as np
class ChooseFrameSpatial:
def __init__(self, interval=[-3,0]):
"""注意,算法所需要的超参数均从此处传入,作为类成员,其他函数不允许传入超参数
Args:
interval (list, optional): 此方法依赖的光谱时间范围默认[-30,30]即获取前TSC开始时刻前30秒到TSC结束时刻的光谱数据作为此算法输入.
"""
self.description=f"{str(self.__class__).split(".")[2]}_{interval[0]}_{interval[1]}"
# print(self.description)
self.interval=interval
def get_data_interval(self, start_time, end_time ):
return start_time+datetime.timedelta(seconds=self.interval[0]),start_time+datetime.timedelta(seconds=self.interval[1])
def run(self,timestamps,rawdata):
############# spatial data compression ######################
space_intensity_mean = np.mean(np.sum(rawdata, axis=1), axis=0)
space_num= 10 # number of spatial points with the smallest variance to keep
light_indices = np.where( (space_intensity_mean > (0.25*np.max(space_intensity_mean)+0.75*np.min(space_intensity_mean)))
& (space_intensity_mean < (0.8*np.max(space_intensity_mean)+0.2*np.min(space_intensity_mean)))
)
if light_indices[0].shape[0] < space_num:
print('No Enough Usable Space Point Found!')
return None
intensity_data = rawdata[:,:,light_indices[0]]
spatial_variance = np.var(np.sum(intensity_data,axis=1), axis=0)
top_10_indices = np.argsort(spatial_variance)[:space_num]
space_selected_data = intensity_data[:, :, top_10_indices]
############# temporal data compression ######################
# framerate = 24
# timewindow = 2
# time_data_intensity = np.sum(np.mean(space_selected_data,axis=2),axis=1)
# min_var = float('inf')
# min_index = 0
# for i in range(len(time_data_intensity)-framerate*timewindow-1):
# window_var = np.var(time_data_intensity[i:i+framerate*timewindow])
# if window_var < min_var:
# min_var = window_var
# min_index = i
# selected_data = space_selected_data[min_index:min_index+framerate*timewindow,:,:]
selected_data=np.mean(np.mean(space_selected_data,axis=0),axis=1)
# print("timewindow_begin=",min_index)
# if (result_dir is not None) and (filenum is not None) :
# for i in range (selected_data.shape[2]):
# Z=selected_data[:,:,i]
# x=np.linspace(400,1000,Z.shape[1])
# y=selected_data[:,1,i]
# x, y = np.meshgrid(x, y)
# fig, ax = plt.subplots(subplot_kw=dict(projection='3d'))
# surf = ax.plot_surface(x, y, Z, cmap=cm.Blues,
# linewidth=0, antialiased=False)
# ax.view_init(elev=30, azim=-60) # change the view angle
# ax.set_zlim(0, 4095)
# ax.set_zlabel("Light Intensity")
# ax.set_xlabel("Spectral Band(nm)")
# ax.set_ylabel("Time (Serial Number)")
# plt.savefig(f"{result_dir}{ filenum} file {i}-th spatial_point_line.png")
# plt.close()
return selected_data
if __name__ =="__main__":
timpestamp=np.zeros((600,))
input_data=np.random.random((600,224,512))*4095
tmp=ChooseFrameSpatial()
output=tmp.run(timpestamp,input_data)
print(f"输入数据维度:{input_data.shape}\n输出数据维度:{output.shape}")

View File

@ -0,0 +1,125 @@
'''
@File : mean.py
@Time : 2024/08/12 13:53:36
@Author : Zhanpeng Yang
@Version : 0.0.1
@Contact : zhpyang@outlook.com
@Desc    : First-stage data preprocessing: select suitable spectra from the continuous spectral data as the input features
'''
import datetime
import numpy as np
from sklearn.cross_decomposition import PLSRegression
from scipy.signal import savgol_filter
from scipy.ndimage import median_filter
class ChooseFrameSpatial:
def __init__(self):
"""注意,算法所需要的超参数均从此处传入,作为类成员,其他函数不允许传入超参数
"""
self.description=f"{str(self.__class__)[8:-2].split(".")[-2]}" #此项当没有新增超参数时不需要修改
print(self.description)
def get_specific_data(self,measureStartDatetime,measureEndDatetime,timestamps,rawdata):
"""
Fetch the spectral data within the specified time window
"""
if isinstance(measureStartDatetime.item(),str):
start_timestamp=datetime.datetime.strptime(measureStartDatetime.item(), "%Y-%m-%d %H:%M:%S").timestamp()*1000 # measurement start time; all timestamps in this project are in ms
end_timestamp=datetime.datetime.strptime(measureStartDatetime.item(), "%Y-%m-%d %H:%M:%S").timestamp()*1000 + 3000 # measurement start time + 3 s
else:
start_timestamp=measureStartDatetime.item().timestamp()*1000
end_timestamp=measureStartDatetime.item().timestamp()*1000 + 3000
start_index=0
end_index=timestamps.shape[0]
for i in range(1,timestamps.shape[0]):
if timestamps[i]>=start_timestamp and timestamps[i-1]<=start_timestamp:
# print(f"开始索引为{i}")
start_index=i
if timestamps[i]>=end_timestamp and timestamps[i-1]<=end_timestamp:
# print(f"结束索引为{i}")
end_index=i
return rawdata[start_index:end_index,...]
def run(self,measureStartDatetime,measureEndDatetime,timestamps,rawdata):
selected_data=self.get_specific_data(measureStartDatetime,measureEndDatetime,timestamps,rawdata)
space_intensity_mean = np.mean(np.sum(selected_data, axis=1), axis=0)
space_num= 10 # keep the 10 spatial points with suitable mean intensity and the smallest variance
light_indices = np.where( (space_intensity_mean > (0.25*np.max(space_intensity_mean)+0.75*np.min(space_intensity_mean)))
& (space_intensity_mean < (0.8*np.max(space_intensity_mean)+0.2*np.min(space_intensity_mean)))
)
if light_indices[0].shape[0] < space_num:
print('No Enough Usable Space Point Found!')
return None
intensity_data = selected_data[:,:,light_indices[0]]
# spatial_variance = np.var(np.sum(intensity_data,axis=1), axis=0)
# top_10_indices = np.argsort(spatial_variance)[:space_num]
time_window = 11
time_steps, wavelengths, spatial_points = intensity_data.shape
filtered_data = np.zeros_like(intensity_data)
for wavelength in range(wavelengths):
for spatial_point in range(spatial_points):
# extract the time series of this (wavelength, spatial point) pair
time_series = intensity_data[:, wavelength, spatial_point]
# apply the median filter
filtered_time_series = median_filter(time_series, size=time_window)
# write the filtered time series back to its original position
filtered_data[:, wavelength, spatial_point] = filtered_time_series
space_selected_data = filtered_data[:,:,:]
space_selected_data = np.mean(np.mean(space_selected_data,axis=2),axis=0)
# space_selected_data = (space_selected_data - np.min(space_selected_data)) / (np.max(space_selected_data) - np.min(space_selected_data))
# # Savitzky-Golay filter
# data_filtered = savgol_filter(space_selected_data, window_length=31, polyorder=2, axis=0)
# spec_indices = [106, 127, 118, 0, 52, 23, 113, 77, 157, 175, 195, 24, 218, 108, 8, 211, 2, 136, 11,
# 36, 129, 109, 153, 188, 200, 196, 7, 19, 6, 198, 193, 221, 156, 187, 162, 204, 85, 104, 41,
# 16, 185, 125, 14, 149, 91, 138, 72, 146, 35, 53, 190, 148, 75, 18, 17, 96, 167, 192, 201, 31,
# 158, 183, 32, 40, 123, 145, 161, 27, 209, 216, 101, 51, 147, 58, 182, 49, 119, 13, 179, 140,
# 105, 45, 55, 33, 73, 111, 97, 194, 121, 89, 38, 12, 197, 173, 160, 131, 141, 37, 208, 47]
# selected_data = data_filtered[spec_indices]
# print(space_selected_data.shape)
return space_selected_data
# return selected_data
if __name__ =="__main__":
raw_data=np.load("/data/SEMS-model-training/dataset/raw_dataset/training/24604919.npz",allow_pickle=True)
print(raw_data.keys())
print(f"炉次号\t{raw_data["furnaceNumber"]}")
print(f"开始测量时间\t{raw_data["measureStartDatetime"]}")
print(f"结束测量时间\t{raw_data["measureEndDatetime"]}")
print(f"时间戳维度\t{raw_data["timestamps"].shape}")
print(f"原始光谱数据维度\t{raw_data["rawSpectralData"].shape}")
tmp=ChooseFrameSpatial()
output=tmp.run(raw_data["measureStartDatetime"],raw_data["measureEndDatetime"],raw_data["timestamps"],raw_data["rawSpectralData"])
print(f"输入数据维度:{raw_data["rawSpectralData"].shape}\n输出数据维度:{output.shape}")

View File

@ -0,0 +1,83 @@
'''
@File : mean.py
@Time : 2024/08/12 13:53:36
@Author : Zhanpeng Yang
@Version : 0.0.1
@Contact : zhpyang@outlook.com
@Desc    : First-stage data preprocessing: select suitable spectra from the continuous spectral data as the input features
'''
import datetime
import numpy as np
class ChooseFrameSpatial:
def __init__(self,):
"""注意,算法所需要的超参数均从此处传入,作为类成员,其他函数不允许传入超参数
"""
self.description=f"{str(self.__class__)[8:-2].split(".")[-2]}" #此项当没有新增超参数时不需要修改
print(self.description)
def get_specific_data(self,measureStartDatetime,measureEndDatetime,timestamps,rawdata):
"""
Fetch the spectral data within the specified time window
"""
start_timestamp=datetime.datetime.strptime(measureStartDatetime.item(), "%Y-%m-%d %H:%M:%S").timestamp()*1000 -1000 # measurement start time - 1 s; all timestamps in this project are in ms
end_timestamp=datetime.datetime.strptime(measureStartDatetime.item(), "%Y-%m-%d %H:%M:%S").timestamp()*1000 +1000 # measurement start time + 1 s
for i in range(1,timestamps.shape[0]):
if timestamps[i]>=start_timestamp and timestamps[i-1]<=start_timestamp:
# print(f"开始索引为{i}")
start_index=i
if timestamps[i]>=end_timestamp and timestamps[i-1]<=end_timestamp:
# print(f"结束索引为{i}")
end_index=i
return rawdata[start_index:end_index,...]
def run(self,measureStartDatetime,measureEndDatetime,timestamps,rawdata):
selected_data=self.get_specific_data(measureStartDatetime,measureEndDatetime,timestamps,rawdata)
space_intensity_mean = np.mean(np.sum(selected_data, axis=1), axis=0)
space_num= 10 # keep the 10 spatial points with suitable mean intensity and the smallest variance
light_indices = np.where( (space_intensity_mean > (0.25*np.max(space_intensity_mean)+0.75*np.min(space_intensity_mean)))
& (space_intensity_mean < (0.8*np.max(space_intensity_mean)+0.2*np.min(space_intensity_mean)))
)
if light_indices[0].shape[0] < space_num:
print('No Enough Usable Space Point Found!')
return None
intensity_data = selected_data[:,:,light_indices[0]]
spatial_variance = np.var(np.sum(intensity_data,axis=1), axis=0)
top_10_indices = np.argsort(spatial_variance)[:space_num]
selected_data = intensity_data[:, :, top_10_indices]
return selected_data
if __name__ =="__main__":
raw_data=np.load("/data/SEMS-model-training/dataset/raw_dataset/training/24604919.npz",allow_pickle=True)
print(raw_data.keys())
print(f"炉次号\t{raw_data["furnaceNumber"]}")
print(f"开始测量时间\t{raw_data["measureStartDatetime"]}")
print(f"结束测量时间\t{raw_data["measureEndDatetime"]}")
print(f"时间戳维度\t{raw_data["timestamps"].shape}")
print(f"原始光谱数据维度\t{raw_data["rawSpectralData"].shape}")
tmp=ChooseFrameSpatial()
output=tmp.run(raw_data["measureStartDatetime"],raw_data["measureEndDatetime"],raw_data["timestamps"],raw_data["rawSpectralData"])
print(f"输入数据维度:{raw_data["rawSpectralData"].shape}\n输出数据维度:{output.shape}")

View File

@ -1,132 +0,0 @@
'''
@File : mean.py
@Time : 2024/08/12 13:53:36
@Author : Zhanpeng Yang
@Version : 0.0.1
@Contact : zhpyang@outlook.com
@Desc    : First-stage data preprocessing: select suitable spectra from the continuous spectral data as the input features
'''
import datetime
import numpy as np
class ChooseFrameSpatial:
def __init__(self, interval=[-30,30],overexposion_threshold = 1,weight_mean_var = 0.5,space_num= 10, time_num = 9,framerate = 4,timewindow_time = 1,Type=6):
"""注意,算法所需要的超参数均从此处传入,作为类成员,其他函数不允许传入超参数
Args:
interval (list, optional): 此方法依赖的光谱时间范围默认[-30,30]即获取前TSC开始时刻前30秒到TSC结束时刻的光谱数据作为此算法输入.
"""
# print(str(self.__class__).split("."))
self.description=f"{str(self.__class__).split(".")[-2]}_{interval[0]}_{interval[1]}"
# print(self.description)
self.overexposion_threshold = overexposion_threshold #过曝阈值:有多少时间点过曝时,判定为这个空间点过曝
self.weight_mean_var =weight_mean_var #强度/方差选取权重默认平均光强权重为1此权重为方差权重取值为[0
self.space_num= space_num #选取空间点数量
self.time_num = time_num #选取时间点数量
self.framerate = framerate #采样帧率
self.timewindow = framerate * timewindow_time #选取时间窗口为1s
self.interval=interval
self.Type=Type
def get_data_interval(self, start_time, end_time ):
return start_time+datetime.timedelta(seconds=self.interval[0]),end_time+datetime.timedelta(seconds=self.interval[1])
def Overexposed_spatial_point_detection(self,inputdata, timethreshold):
# determine whether a spatial point is overexposed; inputdata is a 2-D (time, wavelength) array, timethreshold is the time threshold: if at least that many time points are overexposed, the point is considered overexposed
# returns False if overexposed, True otherwise
row_max_values= np.max(inputdata, axis=1)
overexposed_rows = np.sum(row_max_values >= 4095)
if overexposed_rows > timethreshold:
return False
else:
return True
def run(self,timestamps,rawdata):
# Dimensionality reduction: spatially, keep points whose intensity exceeds a threshold and whose variance is smallest; temporally, split the whole period into segments and average the highest-weighted 1 s within each segment. Type=1 keeps the first 1/3, Type=2 the middle 1/3, Type=3 the last 1/3, Type=4 everything
############# remove overexposed spatial points ######################
unoverposed_indices = [i for i in range(rawdata.shape[2]) if self.Overexposed_spatial_point_detection(rawdata[:, :, i],self.overexposion_threshold)]
rawdata = rawdata[:,:,unoverposed_indices]
############# spatial data compression ######################
space_intensity_mean = np.mean(np.sum(rawdata, axis=1), axis=0)
space_intensity_var = np.var(np.sum(rawdata, axis=1), axis=0)
combined_score = (space_intensity_mean - np.min(space_intensity_mean)) / (np.max(space_intensity_mean) - np.min(space_intensity_mean)) \
- self.weight_mean_var * (space_intensity_var - np.min(space_intensity_var)) / (np.max(space_intensity_var) - np.min(space_intensity_var))
sorted_indices = np.argsort(combined_score)[::-1]
space_selected_data = rawdata[:,:,sorted_indices[:self.space_num]]
############# temporal data compression ######################
space_selected_data_intensity = np.sum(space_selected_data,axis=1)
# split the intensity data along time into time_num chunks, each of shape (total time points / time_num, space_num)
time_length = space_selected_data_intensity.shape[0]
chunk_size = time_length // self.time_num
remainder = time_length % self.time_num
start_index = 0
time_selected_data =None
for i in range(self.time_num):
end_index = start_index + chunk_size + (1 if i < remainder else 0)
chunk = space_selected_data_intensity[start_index:end_index, :]
window_var = np.empty(0)
window_mean = np.empty(0)
for j in range(len(chunk)-self.timewindow-1):
window_var = np.concatenate((window_var, np.expand_dims( np.sum(np.var(chunk[j:j+self.timewindow,:], axis=0)),0)), axis=0)
window_mean = np.concatenate((window_mean, np.expand_dims( np.sum(np.mean(chunk[j:j+self.timewindow,:], axis=0)),0)), axis=0)
combined_score_time = (window_mean - np.min(window_mean)) / (np.max(window_mean) - np.min(window_mean)) \
- self.weight_mean_var * (window_var - np.min(window_var)) / (np.max(window_var) - np.min(window_var))
sorted_indices = np.argsort(combined_score_time)[::-1]
time_window_data = np.mean(space_selected_data[start_index+sorted_indices[0]:start_index+sorted_indices[0]+self.timewindow, :, :],axis=0)
# print(time_selected_data.shape,time_window_data.shape,np.expand_dims( time_window_data,0).shape)
if time_selected_data is None:
time_selected_data=np.expand_dims( time_window_data,0)
else:
time_selected_data = np.concatenate((time_selected_data, np.expand_dims( time_window_data,0)), axis=0)
start_index = end_index
if self.Type == 1:
print("time_selected_data[前1/3].shape: ", time_selected_data[:self.time_num//3,:,:].shape)
return time_selected_data[:self.time_num//3,:,:]
elif self.Type == 2:
print("time_selected_data[中1/3].shape: ", time_selected_data[self.time_num//3:2*self.time_num//3,:,:].shape)
return time_selected_data[self.time_num//3:2*self.time_num//3,:,:]
elif self.Type == 3:
print("time_selected_data[后1/3].shape: ", time_selected_data[2*self.time_num//3:,:,:].shape)
return time_selected_data[2*self.time_num//3:,:,:]
elif self.Type == 4:
print("time_selected_data[前2/3].shape: ", time_selected_data[:2*self.time_num//3,:,:].shape)
return time_selected_data[:2*self.time_num//3,:,:]
elif self.Type == 5:
print("time_selected_data[后2/3].shape: ", time_selected_data[self.time_num//3:,:,:].shape)
return time_selected_data[self.time_num//3:,:,:]
elif self.Type == 6:
print("time_selected_data.shape: ", time_selected_data.shape)
return time_selected_data
else:
print("Type is not 1, 2, 3, 4, 5 or 6 !")
return None
if __name__ =="__main__":
timpestamp=np.zeros((600,))
input_data=np.random.random((600,224,512))*4095
tmp=ChooseFrameSpatial()
output=tmp.run(timpestamp,input_data)
print(f"输入数据维度:{input_data.shape}\n输出数据维度:{output.shape}")

View File

@ -0,0 +1,78 @@
'''
@File : mean.py
@Time : 2024/08/12 13:53:36
@Author : Zhanpeng Yang
@Version : 0.0.1
@Contact : zhpyang@outlook.com
@Desc    : First-stage data preprocessing: select suitable spectra from the continuous spectral data as the input features
'''
import datetime
import numpy as np
from scipy.signal import savgol_filter
class ChooseFrameSpatial:
def __init__(self,):
"""注意,算法所需要的超参数均从此处传入,作为类成员,其他函数不允许传入超参数
Args:
interval (list, optional): 此方法依赖的光谱时间范围默认[-30,30]即获取前TSC开始时刻前30秒到TSC结束时刻的光谱数据作为此算法输入.
"""
# tmp=
# if len(tmp)>2:
# self.description=f"{str(self.__class__).split(".")[-2]}_{interval[0]}_{interval[1]}"
self.description=f"{str(self.__class__)[8:-2].split(".")[-2]}"
print(self.description)
# print(self.description)
def run(self,measureStartDatetime,measureEndDatetime,timestamps,rawdata):
############# spatial data compression ######################
space_intensity_mean = np.mean(np.sum(rawdata, axis=1), axis=0)
space_num= 10 # number of spatial points with the smallest variance to keep
light_indices = np.where( (space_intensity_mean > (0.25*np.max(space_intensity_mean)+0.75*np.min(space_intensity_mean)))
& (space_intensity_mean < (0.8*np.max(space_intensity_mean)+0.2*np.min(space_intensity_mean)))
)
if light_indices[0].shape[0] < space_num:
print('No Enough Usable Space Point Found!')
return None
intensity_data = rawdata[:,:,light_indices[0]]
spatial_variance = np.var(np.sum(intensity_data,axis=1), axis=0)
top_10_indices = np.argsort(spatial_variance)[:space_num]
space_selected_data = intensity_data[:, :, top_10_indices]
selected_data=np.mean(np.mean(space_selected_data,axis=0),axis=1)
selected_data = savgol_filter(selected_data, window_length=11, polyorder=2, axis=0)
top_features = [106, 127, 118, 0, 52, 23, 113, 77, 157, 175, 195, 24, 218, 108, 8, 211, 2, 136,
11, 36, 129, 109, 153, 188, 200, 196, 7, 19, 6, 198, 193, 221, 156, 187, 162, 204,
85, 104, 41, 16, 185, 125, 14, 149, 91, 138, 72, 146, 35, 53, 190, 148, 75, 18,
17, 96, 167, 192, 201, 31, 158, 183, 32, 40, 123, 145, 161, 27, 209, 216, 101, 51,
147, 58, 182, 49, 119, 13, 179, 140, 105, 45, 55, 33, 73, 111, 97, 194, 121, 89,
38, 12, 197, 173, 160, 131, 141, 37, 208, 47]
selected_data=selected_data[top_features]
return selected_data
if __name__ =="__main__":
timpestamp=np.zeros((600,))
input_data=np.random.random((56,224,512))*4095
tmp=ChooseFrameSpatial()
output=tmp.run(0,0,timpestamp,input_data)
print(f"输入数据维度:{input_data.shape}\n输出数据维度:{output.shape}")

View File

@ -0,0 +1,21 @@
'''
@File : max_min.py
@Time : 2024/08/12 14:02:59
@Author : Zhanpeng Yang
@Version : 0.0.1
@Contact : zhpyang@outlook.com
@Desc    : Standardize/scale the input features
'''
import numpy as np
class FeatureScaling:
def __init__(self):
self.description=f"{str(self.__class__).split('.')[-2]}"
self.flag_get_global_info=False
def run( self,feature):
return feature

View File

@ -0,0 +1,23 @@
'''
@File : max_min.py
@Time : 2024/08/12 14:02:59
@Author : Zhanpeng Yang
@Version : 0.0.1
@Contact : zhpyang@outlook.com
@Desc    : Standardize/scale the input features
'''
import numpy as np
class FeatureScaling:
def __init__(self):
self.description=f"{str(self.__class__).split('.')[-2]}"
self.flag_get_global_info=False
def run( self,feature):
feature=(feature-feature.min())/(feature.max()-feature.min())
return feature

View File

@ -0,0 +1,72 @@
'''
@File : max_min.py
@Time : 2024/08/12 14:03:38
@Author : Zhanpeng Yang
@Version : 0.0.1
@Contact : zhpyang@outlook.com
@Desc    : Standardize/scale the input features
'''
import numpy as np
from pathlib import Path
class FeatureScaling:
def __init__(self):
self.description=f"{str(self.__class__).split('.')[-2]}"
self.flag_get_global_info=True
self._min=None
self._max=None
self._mean=None
self._var=None
self._num=0
self.file_path = Path(__file__).absolute()
def load_state_dict(self,state):
self._min=state["_min"]
self._max=state["_max"]
self._mean=state["_mean"]
self._var=state["_var"]
self._num=state["_num"]
def state_dict(self):
return {"_min":self._min,"_max":self._max,"_mean":self._mean,"_var":self._var,"_num":self._num}
def get_global_info(self,feature):
feature=feature.astype(np.float32)
if self._max is None:
self._min=np.zeros(feature.shape,dtype=np.float32)+999999999
self._max=np.zeros(feature.shape,dtype=np.float32)
self._mean=np.zeros(feature.shape,dtype=np.float32)
self._var=np.zeros(feature.shape,dtype=np.float32)
self._num=0
self.feature_shape=feature.shape
self._min=np.minimum(self._min,feature)
self._max=np.maximum(self._max,feature)
if self._num>0:
self._var=((self._num-1)/(self._num))*self._var+(1/(self._num+1))*(feature-self._mean)*(feature-self._mean)
# var must be updated before mean
self._mean=(self._num/(self._num+1))*self._mean+(1/(self._num+1))*feature
self._num =self._num+1
# print("num:",self._num,"_mean:",self._mean,"_var:",self._var,"_min:",self._min,"_max:",self._max)
def run(self,feature):
# for i in range(label.shape[0]):
# label[i]=(label[i]-self._min[i])/(self._max[i]-self._min[i])
feature=(feature-self._mean)/np.sqrt(self._var)
return feature
# def reverse(self,feature):
# # for i in range(labels.shape[1]):
# # labels[:,i]=labels[:,i]*(self._max[i]-self._min[i])+self._min[i]
# labels=labels*np.sqrt(self._var)+self._mean
# return labels

View File

@ -0,0 +1,202 @@
import os
import shutil
from pathlib import Path
import pymysql.cursors
import numpy as np
import gzip
import msgpack
# oldRawDatasetDir=
def getLatestData(dataNum,deviceId='DOJHBG',measureType="TSC"):
connection = pymysql.connect(host='localhost',
user='root',
password='Ghs@2211',
database='SEMS',
cursorclass=pymysql.cursors.DictCursor)
with connection:
with connection.cursor() as cursor:
# define the SQL query
sql = """
SELECT
f.furnaceNumber,
f.measureStartDatetime,
f.measureEndDatetime,
f.Temperature,
f.C,
f.P,
f.S,
f.Mn,
f.Ni,
f.Mo,
f.Cr,
ot.spectralData,
ot.spectralDim,
ot.spatialDim
FROM
furnace f
JOIN
online_test ot ON f.furnaceId = ot.furnaceId
WHERE
f.deviceId = %s AND
f.measureType = %s
ORDER BY
f.furnaceID DESC
LIMIT %s;
"""
values=(deviceId,measureType,dataNum)
cursor.execute(sql,values)
result = cursor.fetchall()
return result
def getMetadataFromMySQL(deviceId='DOJHBG',measureType="TSC"):
connection = pymysql.connect(host='localhost',
user='root',
password='Ghs@2211',
database='SEMS',
cursorclass=pymysql.cursors.DictCursor)
dataNum=0
with connection:
with connection.cursor() as cursor:
# define the SQL query
sql = """
SELECT
COUNT(*)
FROM
furnace as f
WHERE
f.deviceId = %s AND
f.measureType = %s
"""
values=(deviceId,measureType)
cursor.execute(sql,values)
result = cursor.fetchall()
dataNum=result[0]["COUNT(*)"]
sql = """
SELECT furnaceNumber
FROM furnace as f
WHERE
f.deviceId = %s AND
f.measureType = %s
ORDER BY furnaceId DESC
LIMIT 1;
"""
values=(deviceId,measureType)
cursor.execute(sql,values)
result = cursor.fetchall()
lastesFurnaceNumber=result[0]["furnaceNumber"]
return dataNum,lastesFurnaceNumber
def saveDataset(caches, datasetDir,datasetType="train"):
os.makedirs(datasetDir/datasetType)
for cache in caches:
print(type(cache))
if isinstance(cache,str):
oldRawDatasetDir=Path("/data/SEMS-model-training/old_rawdata/dataset")
shutil.copy(oldRawDatasetDir/cache, datasetDir/datasetType)
if isinstance(cache,dict):
print(cache.keys())
spectralData=msgpack.unpackb(gzip.decompress(cache["spectralData"]))
# spectralData=msgpack.unpack(gzip.decompress(spectralDatawithTime["buffer"]))
spectralDataNumpy=np.frombuffer(spectralData["buffer"],dtype=np.uint16).reshape(int(len(spectralData["buffer"])/cache["spectralDim"]/cache["spatialDim"]/2),cache["spectralDim"],cache["spatialDim"])
np.savez(datasetDir/datasetType/f"{cache['furnaceNumber']}.npz",furnaceNumber=cache["furnaceNumber"],measureStartDatetime=cache["measureStartDatetime"],measureEndDatetime=cache["measureEndDatetime"],timestamps=spectralData["timestamps"],rawSpectralData=spectralDataNumpy,rawLabels=np.array([cache["Temperature"],cache["C"],cache["P"],cache["S"],cache["Mn"],cache["Ni"],cache["Mo"],cache["Cr"],]),labelNames=["Temperature","C","P","S","Mn","Ni","Mo","Cr"])
# break
def generateRawDataset(datasetDir="/data/SEMS-model-training/dataset/raw_dataset",datasetNum=1000,validationRate=0.2,testRate=0.1):
datasetDir=Path(datasetDir)
oldRawDatasetDir=Path("/data/SEMS-model-training/old_rawdata/dataset")
oldRawDatasetFileNames=os.listdir(oldRawDatasetDir)
oldRawDatasetFileNames.sort()
oldRawDatasetNum=len(oldRawDatasetFileNames)
# latestFurnaceID=oldRawDatasetFileNames[-1]
DBdataNum,lastesFurnaceNumber=getMetadataFromMySQL(deviceId='DOJHBG',measureType="TSC")
if os.path.exists(datasetDir):
# if a dataset already exists, check whether it is up to date
if os.path.exists(datasetDir/"validation_set"):
# compare the latest furnace number in the database with the latest one in the existing dataset
valNames=os.listdir(datasetDir/"validation_set")
valNames.sort()
latestDatasetFurnaceID=valNames[-1]
if lastesFurnaceNumber==latestDatasetFurnaceID:
# if the dataset is already up to date, return without doing anything
return None
else:
shutil.rmtree(datasetDir)
else:
shutil.rmtree(datasetDir)
os.makedirs(datasetDir)
chooseDBdataNum=0
chooseOLDRawDataNum=0
if (DBdataNum+oldRawDatasetNum)>datasetNum:
if DBdataNum>=datasetNum:
chooseDBdataNum=datasetNum
chooseOLDRawDataNum=0
else:
chooseDBdataNum=DBdataNum
chooseOLDRawDataNum=datasetNum-DBdataNum
else:
chooseDBdataNum=DBdataNum
chooseOLDRawDataNum=oldRawDatasetNum
print(f"旧数据数量:{chooseOLDRawDataNum}, 新数据数量{chooseDBdataNum}")
if chooseDBdataNum>0:
#
DBDataset=getLatestData(chooseDBdataNum,deviceId='DOJHBG',measureType="TSC")
else:
DBDataset=[]
rawDatasetCache=oldRawDatasetFileNames[-chooseOLDRawDataNum:]+DBDataset
print(rawDatasetCache[-1].keys())
datasetNum=len(rawDatasetCache)
valNum=int(datasetNum*validationRate)
testNum=int(datasetNum*testRate)
trainNum=datasetNum-valNum-testNum
saveDataset(rawDatasetCache[:trainNum], datasetDir,datasetType="training")
saveDataset(rawDatasetCache[-testNum-valNum:-testNum], datasetDir,datasetType="validation")
saveDataset(rawDatasetCache[-testNum:], datasetDir,datasetType="test")
if __name__ =="__main__":
tmp=generateRawDataset()
# getMetadataFromMySQL()
# getLatestData(dataNum=1)
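For reference, a sketch (mirroring saveDataset above, illustrative only) of decoding one record returned by getLatestData back into a (frames, spectralDim, spatialDim) array:

import gzip
import msgpack
import numpy as np

row = getLatestData(dataNum=1)[0]                                # newest furnace record
packed = msgpack.unpackb(gzip.decompress(row["spectralData"]))   # dict with "buffer" and "timestamps"
frames = len(packed["buffer"]) // (row["spectralDim"] * row["spatialDim"] * 2)  # 2 bytes per uint16
spectra = np.frombuffer(packed["buffer"], dtype=np.uint16).reshape(
    frames, row["spectralDim"], row["spatialDim"])
timestamps = packed["timestamps"]                                # per-frame timestamps in ms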

View File

@ -33,6 +33,7 @@ class LabelScaling:
self._min[i]=label[i]
def run(self,label):
for i in range(label.shape[0]):
label[i]=(label[i]-self._min[i])/(self._max[i]-self._min[i])
return label
@ -42,4 +43,3 @@ class LabelScaling:
return labels

View File

@ -0,0 +1,75 @@
'''
@File : max_min.py
@Time : 2024/08/12 14:03:38
@Author : Zhanpeng Yang
@Version : 0.0.1
@Contact : zhpyang@outlook.com
@Desc    : Standardize/scale the output labels
'''
import numpy as np
from pathlib import Path
class LabelScaling:
def __init__(self):
self.description=f"{str(self.__class__).split('.')[-2]}"
self.flag_get_global_info=True
self._min=None
self._max=None
self._mean=None
self._var=None
self._num=0
self.file_path = Path(__file__).absolute()
def load_state_dict(self,state):
self._min=state["_min"]
self._max=state["_max"]
self._mean=state["_mean"]
self._var=state["_var"]
self._num=state["_num"]
def state_dict(self):
return {"_min":self._min,"_max":self._max,"_mean":self._mean,"_var":self._var,"_num":self._num}
def get_global_info(self,label):
label=label.astype(np.float32)
label_dim=label.shape[0]
if self._max is None:
self._min=np.zeros((label_dim),dtype=np.float32)+999999999
self._max=np.zeros((label_dim),dtype=np.float32)
self._mean=np.zeros((label_dim),dtype=np.float32)
self._var=np.zeros((label_dim),dtype=np.float32)
self._num=0
self._min=np.minimum(self._min,label)
self._max=np.maximum(self._max,label)
if self._num>0:
self._var=((self._num-1)/(self._num))*self._var+(1/(self._num+1))*(label-self._mean)*(label-self._mean)
# var must be updated before mean
self._mean=(self._num/(self._num+1))*self._mean+(1/(self._num+1))*label
self._num =self._num+1
# print("num:",self._num,"_mean:",self._mean,"_var:",self._var,"_min:",self._min,"_max:",self._max)
# print("_var:",self._var)
def run(self,label):
# for i in range(label.shape[0]):
# label[i]=(label[i]-self._min[i])/(self._max[i]-self._min[i])
label=(label-self._mean)/np.sqrt(self._var)
return label
def reverse(self,labels):
# for i in range(labels.shape[1]):
# labels[:,i]=labels[:,i]*(self._max[i]-self._min[i])+self._min[i]
labels=labels*np.sqrt(self._var)+self._mean
return labels
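Usage sketch (illustrative; training_labels is a hypothetical (N, 8) array of raw label vectors): after get_global_info has seen the training labels, run() standardizes a label vector and reverse() restores the original units:

import numpy as np

ls = LabelScaling()
for y in training_labels:              # hypothetical (N, 8) array of raw labels
    ls.get_global_info(y)

y_scaled = ls.run(training_labels[0].astype(np.float32))   # roughly zero-mean, unit-variance
y_restored = ls.reverse(y_scaled)                          # ~= training_labels[0], up to float error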

View File

@ -14,6 +14,8 @@ import numpy as np
import datetime
import pickle
import h5py
from pathlib import Path
from tqdm import tqdm
class DataPreProcess:
"""
@ -23,274 +25,85 @@ class DataPreProcess:
3. Scale the features and labels
"""
def __init__(self, raw_spectral_data_dir, raw_labels_dir,labels_name,dataset_dir,choose_frame_spatial, features_scaling,labels_scaling,train_ratio=0.8,validate_ratio=0.1) -> None:
def __init__(self, dataset_dir,choose_frame_spatial, features_scaling,labels_scaling) -> None:
"""初始化类,传入所有数据预处理需要的参数
Args:
raw_spectral_data_dir (_type_):原始光谱数据路径
raw_labels_dir (_type_): 原始标签路径
labels_name (_type_): 提取出的标签名
dataset_dir (_type_): 生成的数据集文件夹
choose_frame_spatial (_type_): 类对象进行光谱特征挑选
features_scaling (_type_): 类对象对输入的特征进行放缩使之直接输入神经网络
labels_scaling (_type_): 类对象对输出的的标签进行放缩
train_ratio (float, optional): 训练集比例. Defaults to 0.8.
validate_ratio (float, optional): 验证集比例剩余的即为测试集. Defaults to 0.1.
"""
self.raw_spectral_data_dir=raw_spectral_data_dir
self.raw_labels_dir=raw_labels_dir
self.dataset_dir=dataset_dir
self.labels_name=labels_name
self.dataset_dir=Path(dataset_dir)
self.choose_frame_spatial=choose_frame_spatial
self.features_scaling=features_scaling
self.labels_scaling=labels_scaling
self.train_ratio=train_ratio
self.validate_ratio=validate_ratio
self.labelNames=None
# load the raw labels
self.raw_labels=self._load_raw_labels(raw_labels_dir,labels_name)
# _raw_data_2_dataset(self)
self.run()
# load the raw spectral data cache
self.raw_spectral_data_cache=self._load_raw_spectral_data_cache(raw_spectral_data_dir)
def _raw_data_2_pre_dataset(self,pre_dataset_save_dir,dataset_type="test"):
# load an arbitrary csv file to determine the spectral data dimensions
self.spectral_dim,self.spatial_dim= self._get_spectral_spatial_dim(self.raw_spectral_data_cache)
rawdata_names= os.listdir(self.dataset_dir/"raw_dataset"/dataset_type)
if not os.path.exists(pre_dataset_save_dir/dataset_type):
os.makedirs(pre_dataset_save_dir/dataset_type)
print(f"[预处理]: {dataset_type}数据集选择特征时间与空间点 ")
for name in tqdm(rawdata_names):
# start converting the raw data into the dataset
self._raw_data_2_dataset()
tmp_data=np.load(self.dataset_dir/"raw_dataset"/dataset_type/name,allow_pickle=True)
raw_spectral_data=self.choose_frame_spatial.run(tmp_data["measureStartDatetime"],tmp_data["measureEndDatetime"],tmp_data["timestamps"],tmp_data["rawSpectralData"])
flag_data_augmentation=False
################################
# This block is for data augmentation: raw_spectral_data is 2-D, i.e. several spectral curves, and every curve is paired with the same label.
def _load_raw_labels(self,raw_labels_dir:str,labels_name:list)->pd.DataFrame:
"""读取利用脚本处理后的钢厂给的excel文件所在文件夹会扫描所有文件
并选择指定的label名作为output
并去除值为NaN或0的行
Args:
raw_labels_dir (str): 利用脚本处理后的钢厂给的excel路径
labels_name (list): 指定的作为Label的列
Returns:
pd.DataFrame: 返回所有筛选后的炉次数据
"""
raw_labels=None
for name in os.listdir(raw_labels_dir):
tmp_raw_labels=pd.read_excel(os.path.join(raw_labels_dir,name))
choosed_column=["TSC_start_time","TSC_end_time"]
choosed_column=choosed_column+labels_name
# keep only the columns we want as labels
tmp_raw_labels=tmp_raw_labels.loc[:,choosed_column]
# rows containing NULL
null_rows=tmp_raw_labels.isnull().any(axis=1)
# # rows containing 0
zeros_rows=(tmp_raw_labels==0).any(axis=1) | (tmp_raw_labels=='0').any(axis=1)
# # drop every row that contains any NULL or 0
selected_rows=~(null_rows|zeros_rows)
tmp_raw_labels=tmp_raw_labels[selected_rows]
if raw_labels is None:
raw_labels=tmp_raw_labels
else:
raw_labels=pd.concat([raw_labels,tmp_raw_labels],axis=0)
logging.debug(f"Reading raw label excel file:{name}, which has {tmp_raw_labels.shape[0]} furnaces")
logging.debug(f"Readed raw label excel files, which has {raw_labels.shape[0]} furnaces in total")
return raw_labels
def _load_raw_spectral_data_cache(self,raw_spectral_data_dir:str)->list:
"""生成所有原始光谱数据文件的缓存,包括每个文件记录的开始及结束时间,目的为加快后面读取原始数据的速度
Args:
raw_spectral_data_dir (str): 原始光谱所在路径
Returns:
list: 缓存其中有多少个成员就有多少个原始数据文件每个成员包括格式为datetime的开始及结束时间及文件路径
"""
spectral_file_paths=glob.glob(os.path.join(raw_spectral_data_dir,"*.csv"))
cache_file_paths=glob.glob(os.path.join(raw_spectral_data_dir,"*.pkl"))
update_flag=False
# regenerate the cache if it does not exist or if the number of files it records differs from the number of files in the folder
if len(cache_file_paths)==0:
logging.debug(f"Raw spectral data cache is not existed! Generating")
update_flag=True
elif len(cache_file_paths)==1:
with open(cache_file_paths[0],"rb") as f:
raw_spectral_data_cache=pickle.load(f)
if len(raw_spectral_data_cache) !=len(spectral_file_paths):
logging.debug(f"Raw spectral data cache is out of date! Regenerating, cache file number:{len(raw_spectral_data_cache)}, spectral data file number: {len(spectral_file_paths)}")
update_flag=True
else:
logging.error(f"More the one 'raw_spectral_data_cache.pkl' file is existed in {raw_spectral_data_dir}")
if update_flag:
spectral_file_paths.sort()
raw_spectral_data_cache=[]
for file in spectral_file_paths:
tmp_info={}
tmp_data=np.loadtxt(file, delimiter=",")
start_t=datetime.datetime.fromtimestamp(tmp_data[0,0]/1000)+datetime.timedelta(microseconds=tmp_data[0,0]%1000)
end_t=datetime.datetime.fromtimestamp(tmp_data[-1,0]/1000)+datetime.timedelta(microseconds=tmp_data[-1,0]%1000)
tmp_info["start_t"]=start_t
tmp_info["end_t"]=end_t
tmp_info["file_path"]=file
raw_spectral_data_cache.append(tmp_info)
with open(os.path.join(raw_spectral_data_dir,f"raw_spectral_data_cache.pkl"),"wb") as f:
pickle.dump(raw_spectral_data_cache,f)
return raw_spectral_data_cache
def _get_spectral_spatial_dim(self,raw_spectral_data_cache):
data=np.loadtxt(raw_spectral_data_cache[0]["file_path"], delimiter=",").astype(np.uint64)
if data[0,2]==229376:
spectral_dim =224
spatial_dim=512
if data[0,2]==917504:
spectral_dim =448
spatial_dim=1024
return spectral_dim, spatial_dim
def _read_spectral_data(self,start_time:datetime.datetime,end_time:datetime.datetime)->np.ndarray:
"""获取从start_time到end_time的光谱数据
Args:
start_time (datetime.datetime): 开始时间
end_time (datetime.datetime): 结束时间
Returns:
np.ndarray: 原始光谱数据
"""
def get_spectral_data_per_file(file_path,s_t,e_t):
data=np.loadtxt(file_path, delimiter=",").astype(np.uint64)
if s_t is not None:
tmp_s=datetime.datetime.timestamp(s_t)*1000
tmp_s_index=0
for i in range(data.shape[0]-1):
if data[i,0]<=tmp_s and data[i+1,0]>=tmp_s:
tmp_s_index=i
break
else:
tmp_s_index=0
if e_t is not None:
tmp_e=datetime.datetime.timestamp(e_t)*1000
tmp_e_index=data.shape[0]
for i in range(tmp_s_index,data.shape[0]-1):
if data[i,0]<=tmp_e and data[i+1,0]>=tmp_e:
tmp_e_index=i
break
else:
tmp_e_index=data.shape[0]
with open(file_path[:-3]+"bin", "rb") as f:
f.seek(data[tmp_s_index,1])
d=f.read(np.uint64((tmp_e_index-tmp_s_index)*data[tmp_s_index,2]))
d=np.frombuffer(d, dtype=np.uint16).reshape(tmp_e_index-tmp_s_index,self.spectral_dim,self.spatial_dim)
return data[tmp_s_index:tmp_e_index,0],d
timestamps=None
raw_spectral_data=None
for tmp_info in self.raw_spectral_data_cache:
tmp_data=None
if start_time<tmp_info["start_t"] and end_time>tmp_info["start_t"] and end_time<tmp_info["end_t"]:
# the target interval overlaps the left edge of this file's interval, so take the data from the file start up to end_time
tmp_time_stamp,tmp_data=get_spectral_data_per_file(tmp_info["file_path"] ,None,end_time)
elif start_time<tmp_info["start_t"] and end_time>tmp_info["end_t"]:
# the target interval fully contains this file's interval, so take the data from the file start to the file end
tmp_time_stamp,tmp_data=get_spectral_data_per_file(tmp_info["file_path"] ,None,None)
elif start_time>tmp_info["start_t"] and end_time<tmp_info["end_t"]:
# the target interval is fully contained in this file's interval, so take the data from start_time to end_time
tmp_time_stamp,tmp_data=get_spectral_data_per_file(tmp_info["file_path"] ,start_time,end_time)
elif start_time>tmp_info["start_t"] and start_time<tmp_info["end_t"] and end_time>tmp_info["end_t"]:
# the target interval overlaps the right edge of this file's interval, so take the data from start_time to the file end
tmp_time_stamp,tmp_data=get_spectral_data_per_file(tmp_info["file_path"] ,start_time,None)
if tmp_data is not None:
if raw_spectral_data is None:
timestamps=tmp_time_stamp
raw_spectral_data=tmp_data
else:
timestamps=np.concatenate((timestamps,tmp_time_stamp),axis=0)
raw_spectral_data=np.concatenate((raw_spectral_data,tmp_data),axis=0)
return timestamps,raw_spectral_data
def _raw_data_2_dataset(self):
save_dir=os.path.join(self.dataset_dir,self.choose_frame_spatial.description)
# step 1: select the feature time windows and spatial points, and save them
pre_dataset_save_dir=os.path.join(save_dir,"data")
if not os.path.exists(pre_dataset_save_dir):
os.makedirs(pre_dataset_save_dir)
# if the data path does not exist, regenerate it; if it exists, skip this part.
# Note: if new data arrives, the folders under dataset must be deleted (i.e. the cache cleared) so everything is regenerated
for i in range(self.raw_labels.shape[0]):
start_time,end_time=self.choose_frame_spatial.get_data_interval(self.raw_labels.iloc[i]["TSC_start_time"],self.raw_labels.iloc[i]["TSC_end_time"])
timestamps,raw_spectral_data=self._read_spectral_data(start_time,end_time)
if flag_data_augmentation:
if raw_spectral_data is not None:
# only record if the frame rate of the fetched data is above 2 fps
if raw_spectral_data.shape[0]>2*(end_time-start_time).total_seconds():
logging.debug(f"PreProcess Stage 1: [{i+1}/{self.raw_labels.shape[0]}] with {timestamps.shape[0]} frames")
raw_spectral_data=self.choose_frame_spatial.run(timestamps,raw_spectral_data)
np.savez(os.path.join(pre_dataset_save_dir,f"{timestamps[0]}_{timestamps.shape[0]}.npz",),timestamps=timestamps,raw_spectral_data=raw_spectral_data,raw_labels=self.raw_labels.iloc[i][self.labels_name].to_numpy())
for i in range(raw_spectral_data.shape[0]):
np.savez(pre_dataset_save_dir/dataset_type/f"{name[:-4]}_{i}.npz",furnaceNumber=tmp_data["furnaceNumber"],measureStartDatetime=tmp_data["measureStartDatetime"],measureEndDatetime=tmp_data["measureEndDatetime"],timestamps=tmp_data["timestamps"],rawSpectralData=raw_spectral_data[i,...],rawLabels=tmp_data["rawLabels"],labelNames=tmp_data["labelNames"])
###############################
else:
logging.info(f"Pre Dataset is existed in {pre_dataset_save_dir}")
if raw_spectral_data is not None:
np.savez(pre_dataset_save_dir/dataset_type/name,furnaceNumber=tmp_data["furnaceNumber"],measureStartDatetime=tmp_data["measureStartDatetime"],measureEndDatetime=tmp_data["measureEndDatetime"],timestamps=tmp_data["timestamps"],rawSpectralData=raw_spectral_data,rawLabels=tmp_data["rawLabels"],labelNames=tmp_data["labelNames"])
# step 2: standardize and build the dataset
#####################
self.dataset_save_dir=os.path.join(save_dir,f"{self.features_scaling.description}_{self.labels_scaling.description}")
def _pre_dataset_2_dataset(self,pre_dataset_save_dir,dataset_save_dir,dataset_type="test",savaFlag=True):
if not os.path.exists(self.dataset_save_dir):
os.makedirs(self.dataset_save_dir)
pre_dataset_files=os.listdir(pre_dataset_save_dir)
print(f"[预处理]: {dataset_type}数据集进行数据预处理")
pre_dataset_files=os.listdir(pre_dataset_save_dir/dataset_type)
if self.features_scaling.flag_get_global_info:
if dataset_type=="training" and (self.features_scaling.flag_get_global_info or self.labels_scaling.flag_get_global_info):
for name in pre_dataset_files:
tmp_data=np.load(os.path.join(pre_dataset_save_dir,name),allow_pickle=True)
self.features_scaling.get_global_info(tmp_data["raw_spectral_data"])
tmp_data=np.load(os.path.join(pre_dataset_save_dir,dataset_type,name),allow_pickle=True)
if self.labelNames is None:
self.labelNames=tmp_data["labelNames"]
if self.features_scaling.flag_get_global_info:
self.features_scaling.get_global_info(tmp_data["rawSpectralData"])
if self.labels_scaling.flag_get_global_info:
self.labels_scaling.get_global_info(tmp_data["raw_labels"])
# get the feature/label dimensions
tmp_data=np.load(os.path.join(pre_dataset_save_dir,name),allow_pickle=True)
feature_dim=tmp_data["raw_spectral_data"].shape
label_dim=tmp_data["raw_labels"].shape
self.labels_scaling.get_global_info(tmp_data["rawLabels"])
np.set_printoptions(suppress = True)
# print(f"[预处理]Label全局信息_num",self.labels_scaling._num,"_mean:",self.labels_scaling._mean,"_var:",self.labels_scaling._var,"_min:",self.labels_scaling._min,"_max:",self.labels_scaling._max)
#划分训练集_验证集_测试集
np.random.shuffle(pre_dataset_files)
[train_dateset_files,validate_dateset_files]=np.split(pre_dataset_files,[int(len(pre_dataset_files)*self.train_ratio)])
[validate_dateset_files,test_dateset_files]=np.split(validate_dateset_files,[int(len(pre_dataset_files)*self.validate_ratio)])
if savaFlag:
tmp_data=np.load(pre_dataset_save_dir/dataset_type/pre_dataset_files[0],allow_pickle=True)
feature_dim=tmp_data["rawSpectralData"].shape
label_dim=tmp_data["rawLabels"].shape
# write out the HDF5 files
for dataset_name in ["train","validate","test"]:
if dataset_name=="train":
file_names=train_dateset_files
elif dataset_name=="validate":
file_names=validate_dateset_files
elif dataset_name=="test":
file_names=test_dateset_files
logging.info(f"Generating {dataset_name} dataset with {len(file_names)} samples")
with h5py.File(os.path.join( self.dataset_save_dir,f"{dataset_name}.h5"), 'w') as f:
with h5py.File(os.path.join( dataset_save_dir,f"{dataset_type}.h5"), 'w') as f:
h5_features = f.create_dataset(
'features',
tuple([len(file_names)]+list(feature_dim)),
tuple([len(pre_dataset_files)]+list(feature_dim)),
maxshape=(tuple([None]+list(feature_dim))),
chunks=tuple([1]+list(feature_dim)), # manually set the chunk size to the size of one sample, i.e. tuple([1]+list(feature_dim))
# compression='gzip',
@ -299,52 +112,104 @@ class DataPreProcess:
)
h5_labels = f.create_dataset(
'labels',
tuple([len(file_names)]+list(label_dim)),
tuple([len(pre_dataset_files)]+list(label_dim)),
chunks=tuple([1]+list(label_dim)), # manually set the chunk size to the size of one sample
dtype=np.float32,
)
for i,name in enumerate(file_names):
tmp_data=np.load(os.path.join(pre_dataset_save_dir,name),allow_pickle=True)
for i,name in enumerate(pre_dataset_files):
tmp_data=np.load(os.path.join(pre_dataset_save_dir,dataset_type,name),allow_pickle=True)
feature=self.features_scaling.run(tmp_data["raw_spectral_data"])
label=self.labels_scaling.run(tmp_data["raw_labels"])
feature=self.features_scaling.run(tmp_data["rawSpectralData"])
label=self.labels_scaling.run(tmp_data["rawLabels"])
h5_features[i]=feature.astype(np.float32)
h5_labels[i]=label.astype(np.float32)
else:
pre_dataset_files=os.listdir(pre_dataset_save_dir)
if self.labels_scaling.flag_get_global_info:
for name in pre_dataset_files:
tmp_data=np.load(os.path.join(pre_dataset_save_dir,name),allow_pickle=True)
self.labels_scaling.get_global_info(tmp_data["raw_labels"])
def run(self):
save_dir=self.dataset_dir/self.choose_frame_spatial.description
# step 1: select the feature time windows and spatial points, and save them
pre_dataset_save_dir=save_dir/"pre_dataset"
if not os.path.exists(pre_dataset_save_dir):
os.makedirs(pre_dataset_save_dir)
# if the data path does not exist, regenerate it; if it exists, skip this part.
# Note: if new data arrives, the folders under dataset must be deleted (i.e. the cache cleared) so everything is regenerated
self._raw_data_2_pre_dataset(pre_dataset_save_dir,dataset_type="training")
self._raw_data_2_pre_dataset(pre_dataset_save_dir,dataset_type="validation")
self._raw_data_2_pre_dataset(pre_dataset_save_dir,dataset_type="test")
else:
logging.info(f"Pre Dataset is existed in {pre_dataset_save_dir}")
# step 2: standardize and build the dataset
self.dataset_save_dir=save_dir / f"{self.features_scaling.description}_{self.labels_scaling.description}"
if not os.path.exists(self.dataset_save_dir):
os.makedirs(self.dataset_save_dir)
self._pre_dataset_2_dataset( pre_dataset_save_dir,self.dataset_save_dir,dataset_type="training",savaFlag=True)
self._pre_dataset_2_dataset( pre_dataset_save_dir,self.dataset_save_dir,dataset_type="validation",savaFlag=True)
self._pre_dataset_2_dataset( pre_dataset_save_dir,self.dataset_save_dir,dataset_type="test",savaFlag=True)
else:
self._pre_dataset_2_dataset( pre_dataset_save_dir,self.dataset_save_dir,dataset_type="training",savaFlag=False)
self._pre_dataset_2_dataset( pre_dataset_save_dir,self.dataset_save_dir,dataset_type="validation",savaFlag=False)
self._pre_dataset_2_dataset( pre_dataset_save_dir,self.dataset_save_dir,dataset_type="test",savaFlag=False)
logging.info(f"Standardized Dataset is existed in {self.dataset_save_dir}")
def get_metric(self,outputs,labels):
# print("outputs Before",outputs)
outputs=self.labels_scaling.reverse(outputs)
# print("outputs After",outputs)
# print("labels Before",labels)
labels=self.labels_scaling.reverse(labels)
# print("labels After",labels)
error=outputs-labels
# print("outputs",outputs)
# print("labels",labels)
# print("errors",error)
if len(error.shape)==1:
hit_rate=np.zeros((1))
else:
hit_rate=np.zeros(error.shape[1])
for i,name in enumerate(self.labels_name):
# error[:,i]=outputs[:,i]-labels[:,i]
#["TSC_T","TSC_C","TSC_C_lab","TSC_P_lab"]
bounds=np.zeros(error.shape)
if name =="TSC_T":
bound=10
elif name=="TSC_C":
bound=0.05
elif name=="TSC_C_lab":
bound=0.05
elif name=="TSC_P_lab":
bound=0.0005
# ["Temperature","C","P","S","Mn","Ni","Mo","Cr"]
hit_rate[i]=((error[:,i]>=-bound) &(error[:,i]<=bound)).sum() /error.shape[0]
return error,hit_rate
for i,name in enumerate(self.labelNames):
if name =="Temperature":
bounds[:,i]=10.0+np.zeros(labels[:,i].shape)
elif name=="C":
bounds[:,i]=0.05+np.zeros(labels[:,i].shape)
elif name=="P":
bounds[:,i]=0.005+np.zeros(labels[:,i].shape)
elif name=="S":
bounds[:,i]=0.01+np.zeros(labels[:,i].shape)
elif name=="Mn":
bounds[:,i]=0.05+np.zeros(labels[:,i].shape)
elif name=="Ni":
bounds[:,i]=0.25*labels[:,i]
elif name=="Mo":
bounds[:,i]=0.25*labels[:,i]
elif name=="Cr":
bounds[:,i]=0.25*labels[:,i]
# if isinstance(bound, float):
# print(f"{name}outputs:{outputs[0,i]}labels:{labels[0,i]}error:{error[0,i]}bound:{bound}")
# else:
# print(f"{name}outputs:{outputs[0,i]}labels:{labels[0,i]}error:{error[0,i]}bound:{bound[0]}")
hit_rate[i]=((error[:,i]>=-bounds[:,i]) &(error[:,i]<=bounds[:,i])).sum() /error.shape[0]
return outputs, labels,error,hit_rate,bounds
@ -355,19 +220,13 @@ if __name__=="__main__":
logging.basicConfig(level = logging.DEBUG)
raw_data_dir="/code/admin/20240806-NanEr-5-8-data/rawdata"
labels_path="/code/admin/20240806-NanEr-5-8-data/labels/NanEr"
dataset_dir="/code/admin/20240806-NanEr-5-8-data/dataset"
labels_name=["TSC_T","TSC_C","TSC_C_lab","TSC_P_lab"]
datasetDir="/data/SEMS-model-training/dataset"
from choose_frame_spatial.mean import ChooseFrameSpatial
from features_scaling.max_min import FeatureScaling
from labels_scaling.max_min import LabelScaling
choose_frame_spatial=ChooseFrameSpatial(interval=[-30,30])
from choose_frame_spatial.mean_CARS100_3 import ChooseFrameSpatial
from features_scaling.standardization import FeatureScaling
from labels_scaling.standardization import LabelScaling
choose_frame_spatial=ChooseFrameSpatial()
features_scaling=FeatureScaling()
labels_scaling=LabelScaling()
data_pre_process=DataPreProcess(raw_data_dir,labels_path,labels_name,dataset_dir,choose_frame_spatial,features_scaling,labels_scaling)
data_pre_process=DataPreProcess(datasetDir,choose_frame_spatial,features_scaling,labels_scaling)
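A small worked sketch (made-up numbers) of the hit-rate rule used by get_metric above: a prediction counts as a hit when its error falls inside the per-label tolerance band (e.g. ±10 for Temperature, ±0.05 for C; Ni/Mo/Cr use ±25 % of the true value):

import numpy as np

labels  = np.array([[1600.0, 0.40], [1550.0, 0.30]])   # true [Temperature, C]
outputs = np.array([[1608.0, 0.47], [1545.0, 0.28]])   # de-scaled predictions
bounds  = np.array([[10.0, 0.05], [10.0, 0.05]])       # tolerance per element

error = outputs - labels
hit_rate = ((error >= -bounds) & (error <= bounds)).sum(axis=0) / error.shape[0]
print(hit_rate)   # [1.  0.5]: both temperature predictions hit, only one carbon prediction does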

View File

@ -4,10 +4,10 @@ import os
import time
def load_dataset(dataset_dir):
train_dataset=SpectralDataset(os.path.join(dataset_dir,"train.h5"))
val_dataset=SpectralDataset(os.path.join(dataset_dir,"validate.h5"))
return train_dataset, val_dataset
def load_dataset(dataset_dir,dataset_type="training"):
dataset=SpectralDataset(os.path.join(dataset_dir,f"{dataset_type}.h5"))
# val_dataset=SpectralDataset(os.path.join(dataset_dir,"validation.h5"))
return dataset
class SpectralDataset(torch.utils.data.Dataset):
def __init__(self, hdf5_path):
@ -17,8 +17,8 @@ class SpectralDataset(torch.utils.data.Dataset):
# rdcc_w0=0, # cache eviction policy: values near 0 preferentially evict least-recently-used chunks; values near 1 preferentially evict chunks that have been fully read or written
# rdcc_nslots=1e8, # number of hash slots used to locate cached chunks; recommended to be 10x the number of cached chunks, 100x for best performance. Default is 521
)
self.features=self.hdf5_f["features"]
self.labels=self.hdf5_f["labels"]
self.features=self.hdf5_f["features"][...]
self.labels=self.hdf5_f["labels"][...]
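# Editor's note: indexing with [...] eagerly loads the full HDF5 datasets into memory
# as numpy arrays, trading RAM for faster __getitem__ calls (the previous version kept
# the h5py dataset handles and read lazily per item).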
def __len__(self):
return self.features.shape[0]
@ -26,10 +26,15 @@ class SpectralDataset(torch.utils.data.Dataset):
#构造Input
feature=self.features[idx]
label=self.labels[idx]
return feature, label
@ -37,7 +42,7 @@ class SpectralDataset(torch.utils.data.Dataset):
if __name__ =="__main__":
training_data=SpectralDataset("/home/admin/20240806-NanEr-5-8-data/dataset/mean_-30_30/max_min_max_min/train.h5")
training_data=SpectralDataset("/data/SEMS-model-training/dataset/ChooseFrameSpatial'>_-30_30/max_min_max_min/training.h5")
print("数据集大小为:",len(training_data))
print("输入Feature维度为", training_data[0][0].shape, "输出Label维度为",training_data[0][1].shape)

126
src/data/test.ipynb Normal file
View File

@ -0,0 +1,126 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[预处理]: training数据集进行数据预处理\n",
"[预处理]Label全局信息_num 549 _mean: [1591.5325 0.39140984 0.04271309 0.02187012 0.14065096\n",
" 0.02857196 0.00202732 0.10250819] _var: [786.9018 0.01670736 0.00040787 0.0000213 0.00179902\n",
" 0.00326453 0.00004455 0.00067495] _min: [1480. 0.079 0.0005 0.0003 0.0005 0.001 0.001\n",
" 0.001 ] _max: [1666. 0.829 0.0957 0.044 0.7264 0.823 0.152\n",
" 0.522 ]\n",
"[预处理]: validation数据集进行数据预处理\n",
"[预处理]Label全局信息_num 549 _mean: [1591.5325 0.39140984 0.04271309 0.02187012 0.14065096\n",
" 0.02857196 0.00202732 0.10250819] _var: [786.9018 0.01670736 0.00040787 0.0000213 0.00179902\n",
" 0.00326453 0.00004455 0.00067495] _min: [1480. 0.079 0.0005 0.0003 0.0005 0.001 0.001\n",
" 0.001 ] _max: [1666. 0.829 0.0957 0.044 0.7264 0.823 0.152\n",
" 0.522 ]\n",
"[预处理]: test数据集进行数据预处理\n",
"[预处理]Label全局信息_num 549 _mean: [1591.5325 0.39140984 0.04271309 0.02187012 0.14065096\n",
" 0.02857196 0.00202732 0.10250819] _var: [786.9018 0.01670736 0.00040787 0.0000213 0.00179902\n",
" 0.00326453 0.00004455 0.00067495] _min: [1480. 0.079 0.0005 0.0003 0.0005 0.001 0.001\n",
" 0.001 ] _max: [1666. 0.829 0.0957 0.044 0.7264 0.823 0.152\n",
" 0.522 ]\n"
]
}
],
"source": [
"from pre_process import DataPreProcess\n",
"\n",
"\n",
"datasetDir=\"/data/SEMS-model-training/dataset\"\n",
"\n",
"\n",
"from choose_frame_spatial.DBSCAN import ChooseFrameSpatial\n",
"from features_scaling.standardization import FeatureScaling\n",
"from labels_scaling.standardization import LabelScaling\n",
"choose_frame_spatial=ChooseFrameSpatial()\n",
"features_scaling=FeatureScaling()\n",
"labels_scaling=LabelScaling()\n",
"data_pre_process=DataPreProcess(datasetDir,choose_frame_spatial,features_scaling,labels_scaling)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([1591.5325 , 0.39140984, 0.04271309, 0.02187012,\n",
" 0.14065096, 0.02857196, 0.00202732, 0.10250819],\n",
" dtype=float32)"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"labels_scaling._mean"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Before: [1612.5325, 0.450984, 0.067271309, 0.05687012, 0.2365096, 0.06357196, 0.0034502732, 0.450250819]\n",
"Scaled: [0.7486169269676385, 0.46089706984319634, 1.2160017876067477, 7.584260858761072, 2.26002739629723, 0.612572700572906, 0.21319305953525988, 13.385111606844243]\n",
"Reversed\t: [1612.5325, 0.450984, 0.067271309, 0.05687012, 0.2365096, 0.06357196, 0.0034502732, 0.450250819]\n",
"Before \t: [1612.5325, 0.450984, 0.067271309, 0.05687012, 0.2365096, 0.06357196, 0.0034502732, 0.450250819]\n"
]
}
],
"source": [
"labels= [1612.5325 , 0.450984, 0.067271309 , 0.05687012 , 0.2365096, 0.06357196 , 0.0034502732 , 0.450250819]\n",
"print(f\"Before: {labels}\")\n",
"tmp=labels_scaling.run(labels)\n",
"print(f\"Scaled: {list(tmp)}\")\n",
"labels_r=labels_scaling.reverse(tmp)\n",
"print(f\"Reversed\\t: {list(labels_r)}\")\n",
"print(f\"Before \\t: {labels}\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "dl",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@ -5,6 +5,10 @@ import ray,os
import ray.tune
from functools import partial
import shutil
import subprocess
@ -24,6 +28,16 @@ def main(hydra_cfg : omegaconf.DictConfig) -> None:
hydra_output_dir=hydra_output_dir['runtime']['output_dir']
command = f"""
sudo kill -9 $(sudo lsof -t -i :6006);
tensorboard --logdir {os.path.join(hydra_output_dir,"ray-tune")} --host 0.0.0.0 --port 6006
"""
# create the child process
tensorboard_subprocess = subprocess.Popen(command, shell=True, executable='/bin/bash',start_new_session=True)
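# Editor's note: the shell snippet above assumes passwordless sudo and that port 6006
# is only ever occupied by a stale TensorBoard instance; any other process listening
# on that port would be killed as well.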
#fetch the current hyperparameter configuration
tune_cfg={}
for key in hydra_cfg.ray_tune.config.keys():
@ -44,10 +58,24 @@ def main(hydra_cfg : omegaconf.DictConfig) -> None:
)
#save the best model
best_trial = result.get_best_trial("val_loss", "min", "last")
print(f"Best trial config: {best_trial.config}")
print(f"Best trial final validation loss: {best_trial.last_result['val_loss']}")
print(f"Best trial checkpoint path: {best_trial.checkpoint.path}")
shutil.copytree(best_trial.checkpoint.path, os.path.join(hydra_output_dir,"best-model"))
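# Editor's note: with scope "last", get_best_trial ranks trials by the val_loss of their
# final reported result; the checkpoint directory of that trial is then copied next to
# the Hydra run output as "best-model".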
# tensorboard_subprocess.terminate()
# tensorboard_subprocess.wait()
# print("TensorBoard terminated")

160
src/model/DRSN-CW.py Normal file
View File

@ -0,0 +1,160 @@
import torch
import torch.nn as nn
class BasicBlock(nn.Module):
expansion = 1
def __init__(self, in_channels, out_channels, stride=1):
super().__init__()
self.shrinkage = Shrinkage(out_channels, gap_size=(1))
# residual function
self.residual_function = nn.Sequential(
nn.Conv1d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False),
nn.BatchNorm1d(out_channels),
nn.ReLU(inplace=True),
nn.Conv1d(out_channels, out_channels * BasicBlock.expansion, kernel_size=3, padding=1, bias=False),
nn.BatchNorm1d(out_channels * BasicBlock.expansion),
self.shrinkage
)
# shortcut
self.shortcut = nn.Sequential()
# when the shortcut output dimension does not match the residual function's,
# use a 1x1 convolution to match the dimension
if stride != 1 or in_channels != BasicBlock.expansion * out_channels:
self.shortcut = nn.Sequential(
nn.Conv1d(in_channels, out_channels * BasicBlock.expansion, kernel_size=1, stride=stride, bias=False),
nn.BatchNorm1d(out_channels * BasicBlock.expansion)
)
def forward(self, x):
return nn.ReLU(inplace=True)(self.residual_function(x) + self.shortcut(x))
# a = self.residual_function(x),
# b = self.shortcut(x),
# c = a+b
# return c
class Shrinkage(nn.Module):
def __init__(self, channel, gap_size):
super(Shrinkage, self).__init__()
self.gap = nn.AdaptiveAvgPool1d(gap_size)
self.fc = nn.Sequential(
nn.Linear(channel, channel),
nn.ReLU(inplace=True),
nn.Linear(channel, channel),
nn.Sigmoid(),
)
def forward(self, x):
x_raw = x
x = torch.abs(x)
x_abs = x
x = self.gap(x)
x = torch.flatten(x, 1)
# average = torch.mean(x, dim=1, keepdim=True) #CS
average = x #CW
x = self.fc(x)
x = torch.mul(average, x)
x = x.unsqueeze(2)
# soft thresholding
sub = x_abs - x
zeros = sub - sub
n_sub = torch.max(sub, zeros)
x = torch.mul(torch.sign(x_raw), n_sub)
return x
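# Editor's note (reading of the block above): this is the channel-wise soft thresholding
# of DRSN-CW. A per-channel threshold tau_c = GAP(|x|)_c * sigmoid(fc(GAP(|x|)))_c is
# learned from the absolute activations, and the output is
#   y = sign(x) * max(|x| - tau, 0)
# which shrinks small-magnitude (presumably noise-dominated) responses to zero.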
class SpectralModel(nn.Module):
def __init__(self, block=BasicBlock, num_block=[3, 4, 6, 3], num_classes=8):
super().__init__()
self.in_channels = 64
self.conv1 = nn.Sequential(
nn.Conv1d(1, 64, kernel_size=3, padding=1, bias=False),
nn.BatchNorm1d(64),
nn.ReLU(inplace=True))
# we use a different input size than the original paper,
# so conv2_x's stride is 1
self.conv2_x = self._make_layer(block, 64, num_block[0], 1)
self.conv3_x = self._make_layer(block, 128, num_block[1], 2)
self.conv4_x = self._make_layer(block, 256, num_block[2], 2)
self.conv5_x = self._make_layer(block, 512, num_block[3], 2)
self.avg_pool = nn.AdaptiveAvgPool1d((1))
# self.fc = nn.Linear(512 * block.expansion, 1024)
# self.fc = nn.Linear(1024 , 512)
# self.fc = nn.Linear(512, 512)
self.fc = nn.Sequential(
nn.BatchNorm1d(512),
nn.Linear(512 * block.expansion, 1024),
nn.ReLU(inplace=True),nn.BatchNorm1d(1024),
nn.Linear(1024 , 512),
nn.ReLU(inplace=True),nn.BatchNorm1d(512),
nn.Linear(512, 128),
nn.ReLU(inplace=True),nn.BatchNorm1d(128),
nn.Linear(128, 8),)
def _make_layer(self, block, out_channels, num_blocks, stride):
"""make rsnet layers(by layer i didnt mean this 'layer' was the
same as a neuron netowork layer, ex. conv layer), one layer may
contain more than one residual shrinkage block
Args:
block: block type, basic block or bottle neck block
out_channels: output depth channel number of this layer
num_blocks: how many blocks per layer
stride: the stride of the first block of this layer
Return:
return a rsnet layer
"""
# we have num_blocks blocks per layer; the stride of the first block
# could be 1 or 2, while the other blocks always use stride 1
strides = [stride] + [1] * (num_blocks - 1)
layers = []
for stride in strides:
layers.append(block(self.in_channels, out_channels, stride))
self.in_channels = out_channels * block.expansion
return nn.Sequential(*layers)
def forward(self, x):
x=torch.unsqueeze(x,1)
output = self.conv1(x)
output = self.conv2_x(output)
output = self.conv3_x(output)
output = self.conv4_x(output)
output = self.conv5_x(output)
output = self.avg_pool(output)
output = output.view(output.size(0), -1)
output = self.fc(output)
return output
# def rsnet18():
# """ return a RSNet 18 object
# """
# return RSNet(BasicBlock, [2, 2, 2, 2])
# def rsnet34():
# """ return a RSNet 34 object
# """
# return RSNet(BasicBlock, [3, 4, 6, 3])
if __name__=='__main__':
import torchinfo
model = SpectralModel()
model.eval()
# print(model)
input = torch.randn(64,224)
y = model(input)
print(y.size())
torchinfo.summary(model,input_size=(64,224))
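# Editor's note: with the default num_classes=8 the sanity check above should print
# torch.Size([64, 8]) for the (64, 224) random input.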

View File

@ -3,9 +3,9 @@ import torch.nn.functional as F
import torchinfo
import torch
class CNN_FCNN(nn.Module):
class SpectralModel(nn.Module):
def __init__(self,):
super(CNN_FCNN, self).__init__()
super(SpectralModel, self).__init__()
self.Conv2d_1=nn.Conv2d(in_channels=1, out_channels=512, kernel_size=3,stride=1)
self.Conv2d_2=nn.Conv2d(in_channels=512, out_channels=1024, kernel_size=3,stride=1)
@ -24,7 +24,7 @@ class CNN_FCNN(nn.Module):
self.fc_2=nn.Linear(in_features=2048,out_features=1024)
self.fc_3=nn.Linear(in_features=1024,out_features=512)
self.fc_4=nn.Linear(in_features=512,out_features=256)
self.fc_5=nn.Linear(in_features=256,out_features=4)
self.fc_5=nn.Linear(in_features=256,out_features=8)
# self.reshape_3=nn.Unflatten(dim=0,unflattened_size=(batch_size,601))
@ -71,6 +71,6 @@ class CNN_FCNN(nn.Module):
return x
if __name__=="__main__":
model=CNN_FCNN()
model=SpectralModel()
torchinfo.summary(model,input_size=(64, 48, 224, 10))
torchinfo.summary(model,input_size=(64, 20, 224, 10))

View File

@ -0,0 +1,78 @@
import torch.nn as nn
import torch.nn.functional as F
import torchinfo
import torch
class SpectralModel(nn.Module):
def __init__(self,):
super(SpectralModel, self).__init__()
self.Conv2d_1=nn.Conv2d(in_channels=1, out_channels=4, kernel_size=3,stride=1)
self.Conv2d_2=nn.Conv2d(in_channels=4, out_channels=8, kernel_size=3,stride=1)
# self.MaxPool2d_1=nn.MaxPool2d(kernel_size=2, stride=2)
self.Conv2d_3=nn.Conv2d(in_channels=8, out_channels=16, kernel_size=3,stride=1)
self.Conv2d_4=nn.Conv2d(in_channels=16, out_channels=8, kernel_size=3,stride=1)
self.Conv2d_5=nn.Conv2d(in_channels=8, out_channels=4, kernel_size=2,stride=1)
self.flatten=nn.Flatten(start_dim=1, end_dim=3)
self.fc_1=nn.Linear(in_features=860,out_features=512) #2048->1024
self.fc_2=nn.Linear(in_features=512,out_features=256)
self.fc_3=nn.Linear(in_features=256,out_features=128)
self.fc_4=nn.Linear(in_features=128,out_features=64)
self.fc_5=nn.Linear(in_features=64,out_features=8)
# self.reshape_3=nn.Unflatten(dim=0,unflattened_size=(batch_size,601))
def forward(self, x):
# input_shape=x.shape
x=torch.mean(x,dim=1,keepdim=True)
# x=torch.unsqueeze(x, 1)
x=F.tanh(self.Conv2d_1(x))
x=F.tanh(self.Conv2d_2(x))
# x=self.MaxPool2d_1(x)
x=F.tanh(self.Conv2d_3(x))
x=F.tanh(self.Conv2d_4(x))
x=F.tanh(self.Conv2d_5(x))
x=self.flatten(x)
# print(x.shape)  # debug print used to determine the flattened size, left disabled
x=F.tanh(self.fc_1(x))
x=F.tanh(self.fc_2(x))
x=F.tanh(self.fc_3(x))
x=F.tanh(self.fc_4(x))
x=F.sigmoid(self.fc_5(x))
#
# x=nn.Unflatten(dim=0,unflattened_size=(int(input_shape[0]),int(input_shape[1])))(x)
# x=nn.Flatten(start_dim=2, end_dim=4)(x)
# x,_=self.lstm_1(x)
# x=x[:,-1,:]
# x=nn.LeakyReLU()(self.fc_1(x))
# x=nn.LeakyReLU()(self.fc_2(x))
# x=nn.Sigmoid()(self.fc_3(x))
return x
if __name__=="__main__":
model=SpectralModel()
torchinfo.summary(model,input_size=(64, 20, 224, 10))

View File

@ -3,9 +3,9 @@ import torch.nn.functional as F
import torchinfo
import torch
class CNN_LSTM_FCNN(nn.Module):
class SpectralModel(nn.Module):
def __init__(self,):
super(CNN_LSTM_FCNN, self).__init__()
super(SpectralModel, self).__init__()
# self.reshape_1=nn.Flatten(start_dim=0, end_dim=1)
self.Conv2d_1=nn.Conv2d(in_channels=1, out_channels=256, kernel_size=3,stride=1)
@ -27,7 +27,7 @@ class CNN_LSTM_FCNN(nn.Module):
self.fc_1=nn.Linear(in_features=2048,out_features=1024) #2048->1024
self.fc_2=nn.Linear(in_features=1024,out_features=256)
self.fc_3=nn.Linear(in_features=256,out_features=4)
self.fc_3=nn.Linear(in_features=256,out_features=8)
# self.reshape_3=nn.Unflatten(dim=0,unflattened_size=(batch_size,601))
@ -64,6 +64,6 @@ class CNN_LSTM_FCNN(nn.Module):
return x
if __name__=="__main__":
model=CNN_LSTM_FCNN()
model=SpectralModel()
torchinfo.summary(model,input_size=(64, 48, 224, 10))
torchinfo.summary(model,input_size=(64, 20, 224, 10))

View File

@ -0,0 +1,55 @@
import torch.nn as nn
import torch.nn.functional as F
import torchinfo
import torch
class SpectralModel(nn.Module):
def __init__(self,):
super(SpectralModel, self).__init__()
self.norm=nn.BatchNorm1d(224)
self.fc1 = nn.Linear(224, 1024)
self.fc2 = nn.Linear(1024, 2048)
self.fc3 = nn.Linear(2048, 4096)
self.fc4 = nn.Linear(4096, 2048)
self.fc5 = nn.Linear(2048, 1024)
self.fc6 = nn.Linear(1024, 512)
self.fc7 = nn.Linear(512, 128)
self.fc8 = nn.Linear(128, 8)
def forward(self, x):
x=self.norm(x)
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = F.relu(self.fc3(x))
x = F.relu(self.fc4(x))
x = F.relu(self.fc5(x))
x = F.relu(self.fc6(x))
x = F.relu(self.fc7(x))
x = F.relu(self.fc8(x))
x = F.sigmoid(x)
return x
# class SpectralModel(nn.Module):
# def __init__(self,):
# super(SpectralModel, self).__init__()
# self.norm=nn.BatchNorm1d(224)
# self.submodels = nn.ModuleList([SmallFCNNModel() for _ in range(8)])
# def forward(self, x):
# x=self.norm(x)
# res = [submodel(x) for submodel in self.submodels]
# x=torch.cat(res,dim=1)
# return x
if __name__=="__main__":
model=SpectralModel().to("cuda:0")
torchinfo.summary(model,input_size=(64, 224))

View File

@ -0,0 +1,68 @@
import torch.nn as nn
import torch.nn.functional as F
import torchinfo
import torch
class SmallFCNNModel(nn.Module):
def __init__(self,):
super(SmallFCNNModel, self).__init__()
self.fc1 = nn.Linear(224, 50)
self.fc9 = nn.Linear(50, 1)
def forward(self, x):
# x=self.norm(x)
x = F.relu(self.fc1(x))
x = F.sigmoid(self.fc9(x))
return x
class SpectralModel(nn.Module):
def __init__(self,):
super(SpectralModel, self).__init__()
self.norm=nn.BatchNorm1d(224)
# self.fc1 = nn.Linear(100, 50)
# # self.fc2 = nn.Linear(1024, 2048)
# # self.fc3 = nn.Linear(2048, 4096)
# # self.fc4 = nn.Linear(4096, 2048)
# # self.fc5 = nn.Linear(2048, 1024)
# # self.fc6 = nn.Linear(1024, 512)
# # self.fc7 = nn.Linear(512, 256)
# # self.fc8 = nn.Linear(256, 128)
# self.fc9 = nn.Linear(50, 8)
# # self.fc4 = nn.Linear(10240, 4)
self.submodels = nn.ModuleList([SmallFCNNModel() for _ in range(8)])
def forward(self, x):
x=self.norm(x)
res = [submodel(x) for submodel in self.submodels]
# res=[self.submodels[0](x)]
# for i in range(1,len(self.submodels)):
# res.append(self.submodels[i](0*x))
# x = F.relu(self.fc2(x))
# x = F.relu(self.fc3(x))
# x = F.relu(self.fc4(x))
# x = F.relu(self.fc5(x))
# x = F.relu(self.fc6(x))
# x = F.relu(self.fc7(x))
# x = F.relu(self.fc8(x))
# x = F.sigmoid(self.fc9(x))
x=torch.cat(res,dim=1)
return x
if __name__=="__main__":
model=SpectralModel().to("cuda:0")
torchinfo.summary(model,input_size=(64, 224))


View File

@ -22,6 +22,8 @@ import numpy as np
import matplotlib.pyplot as plt
from optimizer.utils import save_inference_files
logger = logging.getLogger("ray")
def trainable(hydra_cfg,tune_cfg):
@ -39,7 +41,7 @@ def trainable(hydra_cfg,tune_cfg):
## 原始数据预处理为数据集 ##
############################
t_1=time.time()
data_preprocess=DataPreProcess( hydra_cfg.raw_spectral_data_dir,hydra_cfg.raw_labels_dir,hydra_cfg.labels_name,hydra_cfg.dataset_dir,hydra.utils.instantiate(tune_cfg["choose_frame_spatial"]), hydra.utils.instantiate(tune_cfg["features_scaling"]),hydra.utils.instantiate(tune_cfg["labels_scaling"]),hydra_cfg.dataset.train_ratio,hydra_cfg.dataset.validate_ratio)
data_preprocess=DataPreProcess( hydra_cfg.dataset_dir,hydra.utils.instantiate(tune_cfg["choose_frame_spatial"]), hydra.utils.instantiate(tune_cfg["features_scaling"]),hydra.utils.instantiate(tune_cfg["labels_scaling"]))
t_2=time.time()
logger.info(f"Preprocessed raw data costs {t_2-t_1}s")
@ -47,12 +49,18 @@ def trainable(hydra_cfg,tune_cfg):
## 加载数据集为dataloader ##
############################
train_dataset,val_dataset=load_dataset(data_preprocess.dataset_save_dir)
train_dataset=load_dataset(data_preprocess.dataset_save_dir,dataset_type="training")
val_dataset=load_dataset(data_preprocess.dataset_save_dir,dataset_type="validation")
trainloader = torch.utils.data.DataLoader(
train_dataset, batch_size=int(tune_cfg["batch_size"]), shuffle=True, num_workers=hydra_cfg.dataset.num_worker
train_dataset, batch_size=int(tune_cfg["batch_size"]), shuffle=True, num_workers=hydra_cfg.dataset.num_worker,drop_last=False
)
valloader = torch.utils.data.DataLoader(
val_dataset, batch_size=int(tune_cfg["batch_size"]), shuffle=True, num_workers=hydra_cfg.dataset.num_worker
val_dataset, batch_size=int(tune_cfg["batch_size"]), shuffle=True, num_workers=hydra_cfg.dataset.num_worker,drop_last=False
)
t_3=time.time()
logger.info(f"Dataloader costs {t_3-t_2}s")
@ -101,7 +109,7 @@ def trainable(hydra_cfg,tune_cfg):
## 生成模型架构图写入tensorboad ##
################################
sample = train_dataset[0:4][0]
print(sample.shape)
# print(sample.shape)
writer.add_graph(model, torch.tensor(sample).to(device))
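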
@ -113,6 +121,7 @@ def trainable(hydra_cfg,tune_cfg):
################################
## 模型训练 ##
################################
logger.info(f"Start epoch {epoch+1}")
train_loss = 0.0
train_steps = 0
@ -121,12 +130,29 @@ def trainable(hydra_cfg,tune_cfg):
train_errors=None
train_outputs=None
train_labels=None
train_bounds=None
model.train()
# t_1=time.time()
# for batch, (features, labels) in enumerate(trainloader):
# features=features.to(device)
# labels=labels.to(device)
# t_2=time.time()
# print(f"DEBUDING cost{t_2-t_1}s")
for batch, (features, labels) in enumerate(trainloader):
t_iteration_start=time.time()
features=features.to(device)
labels=labels.to(device)
# zero the parameter gradients
optimizer.zero_grad()
@ -136,9 +162,13 @@ def trainable(hydra_cfg,tune_cfg):
labels_npy=labels.cpu().detach().numpy()
outputs_npy=outputs.cpu().detach().numpy()
# print("outputs_npy",outputs_npy)
# print("labels_npy",labels_npy)
# 生成自定义的指标
error,hit_rate=data_preprocess.get_metric(outputs_npy, labels_npy)
outputs_npy, labels_npy,error,hit_rate,bounds=data_preprocess.get_metric(outputs_npy, labels_npy)
# print("outputs_npy_after",outputs_npy)
# print("labels_npy_after",labels_npy)
# Backpropagation
loss.backward()
@ -147,16 +177,21 @@ def trainable(hydra_cfg,tune_cfg):
# 记录我们自定义的指标
train_loss += loss.item()
train_steps += 1
if train_steps==1:
train_hit_rate=hit_rate
train_errors=error
train_outputs=outputs_npy
train_labels=labels_npy
train_bounds=bounds
else:
train_hit_rate=(train_steps-1)/train_steps *train_hit_rate+ 1/train_steps*hit_rate
train_errors=np.concatenate((train_errors,error),axis=0)
train_outputs=np.concatenate((train_outputs,outputs_npy),axis=0)
train_labels=np.concatenate((train_labels,labels_npy),axis=0)
train_bounds=np.concatenate((train_bounds,bounds),axis=0)
t_iteration_end=time.time()
print("Training Epoch:[{}/{}],Iteration:{}/{},Loss:{}, Cost {}s".format(epoch+1,hydra_cfg.train.max_epoch,train_steps,len(trainloader),loss.item(),t_iteration_end-t_iteration_start))
@ -168,6 +203,9 @@ def trainable(hydra_cfg,tune_cfg):
val_loss = 0.0
val_steps = 0
val_errors=None
val_outputs=None
val_labels=None
val_hit_rate=None
t_epoch_train_end=time.time()
@ -177,20 +215,30 @@ def trainable(hydra_cfg,tune_cfg):
for batch, (features, labels) in enumerate(valloader):
t_iteration_start=time.time()
with torch.no_grad():
features=features.to(device)
labels=labels.to(device)
outputs = model(features)
loss = criterion(outputs, labels)
error,hit_rate=data_preprocess.get_metric(outputs.cpu().detach().numpy(), labels.cpu().detach().numpy())
labels_npy=labels.cpu().detach().numpy()
outputs_npy=outputs.cpu().detach().numpy()
outputs_npy, labels_npy,error,hit_rate,bounds=data_preprocess.get_metric(outputs_npy, labels_npy)
val_loss += loss.cpu().numpy()
val_steps += 1
if val_steps==1:
val_hit_rate=hit_rate
val_errors=error
val_outputs=outputs_npy
val_labels=labels_npy
else:
val_hit_rate=(val_steps-1)/val_steps *val_hit_rate+ 1/val_steps*hit_rate
val_errors=np.concatenate((val_errors,error),axis=0)
val_outputs=np.concatenate((val_outputs,outputs_npy),axis=0)
val_labels=np.concatenate((val_labels,labels_npy),axis=0)
t_iteration_end=time.time()
print("Validate Iteration:{}/{},Loss:{}, Cost {}s".format(val_steps,len(valloader),loss.item(),t_iteration_end-t_iteration_start))
@ -206,41 +254,157 @@ def trainable(hydra_cfg,tune_cfg):
checkpoint_data = {
"epoch": epoch,
"net_state_dict": model.state_dict(),
"optimizer_state_dict": optimizer.state_dict(),
}
with tempfile.TemporaryDirectory() as checkpoint_dir:
data_path = pathlib.Path(checkpoint_dir) / "data.pkl"
with open(data_path, "wb") as fp:
pickle.dump(checkpoint_data, fp)
checkpoint = ray.train.Checkpoint.from_directory(checkpoint_dir)
reports={"val_loss": val_loss / val_steps,
"train_loss": train_loss,
}
for i,name in enumerate(hydra_cfg.labels_name):
reports[f"train_hit_rate_{name}"]=train_hit_rate[i]
reports[f"val_hit_rate_{name}"]=val_hit_rate[i]
writer.add_histogram(tag=f'train_error_{name}', values=train_errors[:,i], global_step=epoch)
for i,name in enumerate(data_preprocess.labelNames):
# reports[=
writer.add_scalar(f"{name}/training/hit_rate",train_hit_rate[i],global_step=epoch)
writer.add_scalar(f"{name}/validation/hit_rate",val_hit_rate[i],global_step=epoch)
if (epoch!=0 and epoch%hydra_cfg.train.checkpoint_interval==0):
for i,name in enumerate(data_preprocess.labelNames):
# reports[=
writer.add_histogram(tag=f'{name}/training/error_histogram', values=train_errors[:,i], global_step=epoch)
writer.add_histogram(tag=f'{name}/validation/error_histogram', values=val_errors[:,i], global_step=epoch)
fig, ax = plt.subplots()
ax.scatter(np.arange(train_outputs.shape[0]),train_outputs[:,i])
ax.scatter(np.arange(train_outputs.shape[0]),train_labels[:,i])
writer.add_figure(f'train_imgage_{name}', fig , epoch)
ax.scatter(train_labels[:,i],train_outputs[:,i])
ax.plot([train_labels[:,i].min(), train_labels[:,i].max()], [train_labels[:,i].min(), train_labels[:,i].max()], 'r--')  # y = x reference line
ax.plot(train_labels[:,i],train_labels[:,i]-train_bounds[:,i], 'r--')
ax.plot(train_labels[:,i],train_labels[:,i]+train_bounds[:,i], 'r--')
ax.set_xlabel('Actual Values')
ax.set_ylabel('Estimated Values')
writer.add_figure(f'{name}/training/actual_vs_estimated_value', fig , epoch)
plt.close(fig)
fig, ax = plt.subplots()
ax.scatter(val_labels[:,i],val_outputs[:,i])
ax.plot([val_labels[:,i].min(), val_labels[:,i].max()], [val_outputs[:,i].min(), val_outputs[:,i].max()], 'r--')
ax.set_xlabel('Actual Values')
ax.set_ylabel('Estimated Values')
writer.add_figure(f'{name}/validation/actual_vs_estimated_value', fig , epoch)
plt.close(fig)
fig, ax = plt.subplots()
ax.scatter(train_labels[:,i],train_errors[:,i])
ax.plot(train_labels[:,i],[0 for i in range(train_labels.shape[0])], color='r', linestyle='--')
ax.plot(train_labels[:,i],-train_bounds[:,i], 'r--')
ax.plot(train_labels[:,i],+train_bounds[:,i], 'r--')
ax.set_xlabel('Actual Values')
ax.set_ylabel('Residual error')
writer.add_figure(f'{name}/training/actual_vs_residual', fig , epoch)
plt.close(fig)
fig, ax = plt.subplots()
ax.scatter(val_labels[:,i],val_errors[:,i])
ax.plot(val_labels[:,i],[0 for i in range(val_labels.shape[0])], color='r', linestyle='--')
ax.set_xlabel('Actual Values')
ax.set_ylabel('Residual error')
writer.add_figure(f'{name}/validation/actual_vs_residual', fig , epoch)
plt.close(fig)
with tempfile.TemporaryDirectory() as checkpoint_dir:
checkpoint_data = {
"epoch": epoch,
"net_state_dict": model.state_dict(),
"optimizer_state_dict": optimizer.state_dict(),
"data_preprocess":data_preprocess
}
data_path = pathlib.Path(checkpoint_dir) / "data.pkl"
with open(data_path, "wb") as fp:
pickle.dump(checkpoint_data, fp)
save_inference_files(model,data_preprocess,checkpoint_dir)
checkpoint = ray.train.Checkpoint.from_directory(checkpoint_dir)
ray.train.report(
reports,
checkpoint=checkpoint,
)
else:
ray.train.report(reports,)
t_epoch_end=time.time()
logger.info(f"Save Checkpoint costs {t_epoch_end-t_epoch_val_end}s")
print("Training Epoch:[{}/{}], Average Loss:{}, Cost {}s".format(epoch+1,hydra_cfg.train.max_epoch,train_loss,t_epoch_end-t_epoch_start))
for i,name in enumerate(hydra_cfg.labels_name):
print(f"train_hit_rate_{name}: {train_hit_rate[i]} ",end=" ")
print(f"val_hit_rate_{name}: {val_hit_rate[i]} ",end=" ")
# for i,name in enumerate(hydra_cfg.labels_name):
# print(f"train_hit_rate_{name}: {train_hit_rate[i]} ",end=" ")
# print(f"val_hit_rate_{name}: {val_hit_rate[i]} ",end=" ")
#####################################################
## 每个epoch保存checkpoint并记录到tensorboard ##
#####################################################
#测试
test_dataset=load_dataset(data_preprocess.dataset_save_dir,dataset_type="test")
testloader = torch.utils.data.DataLoader(
test_dataset, batch_size=int(tune_cfg["batch_size"]), shuffle=True, num_workers=hydra_cfg.dataset.num_worker,drop_last=False
)
test_loss = 0.0
test_steps = 0
test_errors=None
test_outputs=None
test_labels=None
test_hit_rate=None
logger.info(f"Test starts")
t_epoch_test_start=time.time()
model.eval()
for batch, (features, labels) in enumerate(testloader):
t_iteration_start=time.time()
with torch.no_grad():
features=features.to(device)
labels=labels.to(device)
outputs = model(features)
loss = criterion(outputs, labels)
labels_npy=labels.cpu().detach().numpy()
outputs_npy=outputs.cpu().detach().numpy()
outputs_npy, labels_npy,error,hit_rate,bounds=data_preprocess.get_metric(outputs_npy, labels_npy)
test_loss += loss.cpu().numpy()
test_steps += 1
if test_steps==1:
test_hit_rate=hit_rate
test_errors=error
test_outputs=outputs_npy
test_labels=labels_npy
else:
test_hit_rate=(test_steps-1)/test_steps *test_hit_rate+ 1/test_steps*hit_rate
test_errors=np.concatenate((test_errors,error),axis=0)
test_outputs=np.concatenate((test_outputs,outputs_npy),axis=0)
test_labels=np.concatenate((test_labels,labels_npy),axis=0)
t_iteration_end=time.time()
print("Test Iteration:{}/{},Loss:{}, Cost {}s".format(test_steps,len(testloader),loss.item(),t_iteration_end-t_iteration_start))
t_epoch_test_end=time.time()
logger.info(f"Test costs {t_epoch_test_end-t_epoch_test_start}s")
tmp_string=""
for i,name in enumerate(data_preprocess.labelNames):
tmp_string+=f"{name}\t:{test_hit_rate[i]*100:.2f}%\n"
writer.add_text(f"test/hit_rate",tmp_string)

48
src/optimizer/utils.py Normal file
View File

@ -0,0 +1,48 @@
import shutil
import pathlib
import pickle
import torch
def save_inference_files(model,data_preprocess,checkpoint_dir):
sava_dir=pathlib.Path(checkpoint_dir)/"inference"
print("开始在checkpoint中保存推理所需文件")
print("choose_frame_spatial文件路径",data_preprocess.choose_frame_spatial.file_path.parent)
#保存choose_frame_spatial相关文件
sava_dir.mkdir(mode=0o777, parents=True, exist_ok=True)
shutil.copy(data_preprocess.choose_frame_spatial.file_path,sava_dir/"choose_frame_spatial.py")
with open(sava_dir/"choose_frame_spatial.pkl",'wb') as f:
pickle.dump(data_preprocess.choose_frame_spatial.state_dict(),f)
shutil.copy(data_preprocess.features_scaling.file_path,sava_dir/"features_scaling.py")
with open(sava_dir/"features_scaling.pkl",'wb') as f:
pickle.dump(data_preprocess.features_scaling.state_dict(),f)
shutil.copy(data_preprocess.labels_scaling.file_path,sava_dir/"labels_scaling.py")
with open(sava_dir/"labels_scaling.pkl",'wb') as f:
pickle.dump(data_preprocess.labels_scaling.state_dict(),f)
with open(sava_dir/"labelNames.pkl",'wb') as f:
pickle.dump(data_preprocess.labelNames,f)
input_tensor = torch.rand((1,*data_preprocess.features_scaling.feature_shape), dtype=torch.float32).to("cuda:0")
torch.onnx.export(
model, # model to export
(input_tensor,), # inputs of the model,
str(sava_dir/"model.onnx"), # filename of the ONNX model
input_names=["input"], # Rename inputs for the ONNX model
dynamo=True # True or False to select the exporter to use
)
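# Editor's note (hedged usage sketch, not part of the commit): the exported artifacts
# could later be consumed roughly like this, assuming onnxruntime is available and the
# scaling objects are rebuilt from the copied *.py files and their *.pkl state_dicts:
#
#   import onnxruntime as ort
#   import numpy as np
#   sess = ort.InferenceSession("inference/model.onnx")
#   x = features_scaling.run(raw_features).astype(np.float32)[None, ...]  # raw_features is a placeholder
#   y_scaled = sess.run(None, {"input": x})[0]
#   y = labels_scaling.reverse(y_scaled)
#
# The name "input" matches input_names in the export call above; everything else here
# is an assumption about the inference side, not code from this repository.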

149
src/scripts/Add_Lab.py Normal file
View File

@ -0,0 +1,149 @@
import pandas as pd
# from tqdm import tqdm
if __name__ =="__main__":
label_file_path="/data/SEMS-model-training/labels/NanEr/2024-05-15_08-03_2335.xlsx"
raw_label_lab_file_path="/data/SEMS-model-training/labels/raw_labels/NanEr/5-8月份转炉TSC和TSO化验结果.xls"
save_dir="/data/SEMS-model-training/labels/NanEr/2024-05-15_08-03_2335_.xlsx"
labels=pd.read_excel(label_file_path)
print(f"原始数据量{labels.shape[0]}")
labels=labels.loc[:,["Furnace_Number","Steel_Type","TSC_start_time","TSC_end_time","TSC_T","TSC_C","TSO_start_time","TSO_end_time","TSO_T","TSO_C"]]
#The filtering below has been folded into the data preprocessing step; dropping a row whenever any column is 0 is too strict, a row should only be dropped when one of the selected labels is 0
# # select the rows containing NULL
null_rows=labels.isnull().any(axis=1)
# # select the rows containing 0
zeros_rows=(labels==0).any(axis=1) | (labels=='0').any(axis=1)
# # drop every row that contains any NULL or 0
selected_rows=~(null_rows|zeros_rows)
labels=labels[selected_rows]
print(f"删除无效数据后{labels.shape[0]}")
labels_output=labels.copy()
labels_output['TSC_P']=0.0
labels_output['TSC_S']=0.0
labels_output['TSC_Mn']=0.0
labels_output['TSC_Ni']=0.0
labels_output['TSC_Mo']=0.0
labels_output['TSC_Cr']=0.0
labels_output['TSO_P']=0.0
labels_output['TSO_S']=0.0
labels_output['TSO_Mn']=0.0
labels_output['TSO_Ni']=0.0
labels_output['TSO_Mo']=0.0
labels_output['TSO_Cr']=0.0
raw_label_lab=pd.read_excel(raw_label_lab_file_path)
raw_label_lab=raw_label_lab.loc[:,["炉次号","分析时间","Mn","P","S","Ni","Cr","Mo"]]
# print(raw_label_lab)
for i in range(labels.shape[0]):
# i=4
furnaceID=labels.iloc[i]["Furnace_Number"]
# print(raw_label_lab[raw_label_lab["炉次号"]==furnaceID])
tmp=raw_label_lab[raw_label_lab["炉次号"]==furnaceID]
if tmp.shape[0]==0:
print(f"炉次号{furnaceID}找不到对应的数据")
continue
elif tmp.shape[0]==1:
print(f"炉次号{furnaceID}找到1个")
labels_output.loc[i,'TSC_P']=tmp.iloc[0]["P"]
labels_output.loc[i,'TSC_S']=tmp.iloc[0]["S"]
labels_output.loc[i,'TSC_Mn']=tmp.iloc[0]["Mn"]
labels_output.loc[i,'TSC_Ni']=tmp.iloc[0]["Ni"]
labels_output.loc[i,'TSC_Mo']=tmp.iloc[0]["Mo"]
labels_output.loc[i,'TSC_Cr']=tmp.iloc[0]["Cr"]
labels_output.loc[i,'TSO_P']=tmp.iloc[0]["P"]
labels_output.loc[i,'TSO_S']=tmp.iloc[0]["S"]
labels_output.loc[i,'TSO_Mn']=tmp.iloc[0]["Mn"]
labels_output.loc[i,'TSO_Ni']=tmp.iloc[0]["Ni"]
labels_output.loc[i,'TSO_Mo']=tmp.iloc[0]["Mo"]
labels_output.loc[i,'TSO_Cr']=tmp.iloc[0]["Cr"]
else:
print(f"炉次号{furnaceID}找到{tmp.shape[0]}")
#find the lab analysis closest in time to the TSC measurement
min_time=None
min_index=0
for j in range(tmp.shape[0]):
# print(j,i)
delta_time=tmp.iloc[j]["分析时间"]-labels.iloc[i]["TSC_end_time"]
if min_time is None:
min_time=delta_time
else:
if delta_time<min_time:
min_time=delta_time
min_index=j
# print(min_time,min_index)
labels_output.loc[i,'TSC_P']=tmp.iloc[min_index]["P"]
labels_output.loc[i,'TSC_S']=tmp.iloc[min_index]["S"]
labels_output.loc[i,'TSC_Mn']=tmp.iloc[min_index]["Mn"]
labels_output.loc[i,'TSC_Ni']=tmp.iloc[min_index]["Ni"]
labels_output.loc[i,'TSC_Mo']=tmp.iloc[min_index]["Mo"]
labels_output.loc[i,'TSC_Cr']=tmp.iloc[min_index]["Cr"]
# labels.iloc[i]['TSO_P']=tmp.iloc[i]["P"]
# labels.iloc[i]['TSO_S']=tmp.iloc[i]["S"]
# labels.iloc[i]['TSO_Mn']=tmp.iloc[i]["Mn"]
# labels.iloc[i]['TSO_Ni']=tmp.iloc[i]["Ni"]
# labels.iloc[i]['TSO_Mo']=tmp.iloc[i]["Mo"]
# labels.iloc[i]['TSO_Cr']=tmp.iloc[i]["Cr"]
#find the lab analysis closest in time to the TSO measurement
min_time=None
min_index=0
for j in range(tmp.shape[0]):
# print(j,i)
# print(tmp.iloc[j]["分析时间"],labels.iloc[i]["TSO_end_time"])
delta_time=tmp.iloc[j]["分析时间"]-labels.iloc[i]["TSO_end_time"]
if min_time is None:
min_time=delta_time
else:
if delta_time<min_time:
min_time=delta_time
min_index=j
# print(min_time,min_index)
labels_output.loc[i,'TSO_P']=tmp.iloc[min_index]["P"]
labels_output.loc[i,'TSO_S']=tmp.iloc[min_index]["S"]
labels_output.loc[i,'TSO_Mn']=tmp.iloc[min_index]["Mn"]
labels_output.loc[i,'TSO_Ni']=tmp.iloc[min_index]["Ni"]
labels_output.loc[i,'TSO_Mo']=tmp.iloc[min_index]["Mo"]
labels_output.loc[i,'TSO_Cr']=tmp.iloc[min_index]["Cr"]
null_rows=labels_output.isnull().any(axis=1)
# # select the rows containing 0
zeros_rows=(labels_output==0).any(axis=1) | (labels_output=='0').any(axis=1)
# # drop every row that contains any NULL or 0
selected_rows=~(null_rows|zeros_rows)
labels_output=labels_output[selected_rows]
print(labels_output)
print(f"插入实验室数据的后的大小{labels_output.shape[0]}")
labels_output.to_excel(save_dir,index=False)
# for i in range(labels.shape[0]):
# print(f"[{i+1}/{labels.shape[0]}],炉次号:{labels.iloc[i]["Furnace_Number"]}")
# furnaceID=labels.iloc[i]["Furnace_Number"]

View File

@ -16,9 +16,10 @@ import os
if __name__ =="__main__":
raw_label_file_path="/home/admin/20240806-NanEr-5-8-data/labels/raw_labels/NanEr/5-8月份6#炉生产数据.xls"
raw_label_file_path="/data/SEMS-model-training/labels/raw_labels/NanEr/5-8月份6#炉生产数据.xls"
# raw_label_lab_file_path="/data/SEMS-model-training/labels/raw_labels/NanEr/5-8月份转炉TSC和TSO化验结果.xls"
save_dir="/home/admin/20240806-NanEr-5-8-data/labels/NanEr"
save_dir="/data/SEMS-model-training/labels/NanEr"
select_and_rename={
"炉号":"Furnace_Number",

View File

@ -0,0 +1,60 @@
import multiprocessing.pool
import pathlib
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA
from sklearn.manifold import TSNE
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.cluster import OPTICS
import multiprocessing
def plot_npz(file_path:pathlib.Path,output_folder:pathlib.Path)->None:
output_folder=output_folder
output_folder.mkdir(mode=0o777,parents=True,exist_ok=True)
data=np.load(file_path,allow_pickle=True)
#此处data["rawSpectralData"]的维度为(时间维度,光谱维度,空间维度)
rawSpectralData=data["rawSpectralData"]
plt.figure()
spectrum_band=np.linspace(400,1000,224)
# plt.ylim([0,4095])
plt.plot(spectrum_band,rawSpectralData)
# plt.title("Max Point Spectrum")
plt.savefig(output_folder/f"{file_path.stem}.png",dpi=100)
plt.close()
#画最强点的光谱
def main(raw_data_folder:pathlib.Path,output_folder:pathlib.Path)->None:
files=list(raw_data_folder.glob("*.npz"))
# plot_npz(files[0],output_folder )
p=multiprocessing.Pool(4)
for i in range(len(files)):
p.apply_async(plot_npz, args=(files[i],output_folder))
print('Waiting for all subprocesses done...')
p.close()
p.join()
print('All subprocesses done.')
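# Editor's note: the apply_async results are never collected here, so any exception
# raised inside plot_npz is swallowed silently; keeping the AsyncResult objects and
# calling .get() on them would surface worker errors.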
if __name__=="__main__":
raw_dataset_folder=pathlib.Path("/data/SEMS-model-training/dataset/DBSCAN_eps_0.15_min_samples_10/pre_dataset")
output_folder=pathlib.Path("/data/SEMS-model-training/dataset/DBSCAN_eps_0.15_min_samples_10/pre_dataset_visual")
main(raw_dataset_folder/"test",output_folder/"test",)
main(raw_dataset_folder/"training",output_folder/"training",)
main(raw_dataset_folder/"validation",output_folder/"validation",)

View File

@ -0,0 +1,265 @@
import multiprocessing.pool
import pathlib
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA
from sklearn.manifold import TSNE
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.cluster import OPTICS
import multiprocessing
def plot_npz(file_path:pathlib.Path,output_folder:pathlib.Path)->None:
output_folder=output_folder/file_path.stem
output_folder.mkdir(mode=0o777,parents=True,exist_ok=True)
data=np.load(file_path,allow_pickle=True)
#此处data["rawSpectralData"]的维度为(时间维度,光谱维度,空间维度)
rawSpectralData=data["rawSpectralData"].transpose(0, 2, 1)
rawSpectralData=rawSpectralData.reshape((rawSpectralData.shape[0]*rawSpectralData.shape[1],rawSpectralData.shape[2]))
#rawSpectralData的维度为时间维度*空间维度,光谱维度)
tmp=np.max(rawSpectralData,axis=1)
rawSpectralData=rawSpectralData[(tmp>500) & (tmp<4095),:]
#选出没有欠曝与过曝的空间点
rawSpectralData_normed=(rawSpectralData-np.min(rawSpectralData,axis=1,keepdims=True))/(np.max(rawSpectralData,axis=1,keepdims=True)-np.min(rawSpectralData,axis=1,keepdims=True))
#归一化所有光谱,去除强度信息
plt.figure()
spectrum_band=np.linspace(400,1000,224)
index=np.argmax(rawSpectralData)
index=np.unravel_index(index, rawSpectralData.shape)
plt.ylim([0,4095])
plt.plot(spectrum_band,rawSpectralData[index[0],:])
plt.title("Max Point Spectrum")
plt.savefig(output_folder/"max_point_spectrum.png",dpi=100)
plt.close()
#画最强点的光谱
################################
# PCA
################################
# pca_norm_1D = PCA(n_components=1)
# norm_feat_1D = pca_norm_1D.fit(rawSpectralData_normed).transform(rawSpectralData_normed)
# pca_norm_2D = PCA(n_components=2)
# norm_feat_2D = pca_norm_2D.fit(rawSpectralData_normed).transform(rawSpectralData_normed)
# pca_norm_3D = PCA(n_components=3)
# norm_feat_3D = pca_norm_3D.fit(rawSpectralData_normed).transform(rawSpectralData_normed)
# pca_1D = PCA(n_components=1)
# feat_1D = pca_1D.fit(rawSpectralData).transform(rawSpectralData)
# pca_2D = PCA(n_components=2)
# feat_2D = pca_2D.fit(rawSpectralData).transform(rawSpectralData)
# pca_3D = PCA(n_components=3)
# feat_3D = pca_3D.fit(rawSpectralData).transform(rawSpectralData)
# fig, axs = plt.subplots(2, 3,figsize=(15,10))
# axs[0,0].scatter(norm_feat_1D, np.zeros((norm_feat_1D.shape[0],)), s=0.1)
# axs[0, 0].set_title(' norm')
# axs[0,1].scatter(norm_feat_2D[:,0], norm_feat_2D[:,1], s=0.1)
# ax_3d = fig.add_subplot(2, 3, 3, projection='3d')
# ax_3d.scatter(norm_feat_3D[:, 0], norm_feat_3D[:, 1], norm_feat_3D[:, 2], s=0.01)
# axs[1,0].scatter(feat_1D, np.zeros((feat_1D.shape[0],)), s=0.1)
# axs[1, 0].set_title(' raw')
# axs[1,1].scatter(feat_2D[:,0], feat_2D[:,1], s=0.1)
# ax_3d = fig.add_subplot(2, 3, 6, projection='3d')
# ax_3d.scatter(feat_3D[:, 0], feat_3D[:, 1], feat_3D[:, 2], s=0.01)
# plt.savefig(output_folder/"PCA.png",dpi=100)
# plt.close()
################################
# KernelPCA gamma =15
################################
def plot_KernelPCA(gamma):
pca_norm_1D = KernelPCA(n_components=1, kernel='rbf', gamma=gamma)
norm_feat_1D = pca_norm_1D.fit(rawSpectralData_normed).transform(rawSpectralData_normed)
pca_norm_2D = KernelPCA(n_components=2, kernel='rbf', gamma=gamma)
norm_feat_2D = pca_norm_2D.fit(rawSpectralData_normed).transform(rawSpectralData_normed)
pca_norm_3D = KernelPCA(n_components=3, kernel='rbf', gamma=gamma)
norm_feat_3D = pca_norm_3D.fit(rawSpectralData_normed).transform(rawSpectralData_normed)
pca_1D = KernelPCA(n_components=1, kernel='rbf', gamma=gamma)
feat_1D = pca_1D.fit(rawSpectralData).transform(rawSpectralData)
pca_2D = KernelPCA(n_components=2, kernel='rbf', gamma=gamma)
feat_2D = pca_2D.fit(rawSpectralData).transform(rawSpectralData)
pca_3D = KernelPCA(n_components=3, kernel='rbf', gamma=gamma)
feat_3D = pca_3D.fit(rawSpectralData).transform(rawSpectralData)
fig, axs = plt.subplots(2, 3,figsize=(15,10))
axs[0,0].scatter(norm_feat_1D, np.zeros((norm_feat_1D.shape[0],)), s=0.1)
axs[0, 0].set_title(' norm')
axs[0,1].scatter(norm_feat_2D[:,0], norm_feat_2D[:,1], s=0.1)
ax_3d = fig.add_subplot(2, 3, 3, projection='3d')
ax_3d.scatter(norm_feat_3D[:, 0], norm_feat_3D[:, 1], norm_feat_3D[:, 2], s=0.01)
axs[1,0].scatter(feat_1D, np.zeros((feat_1D.shape[0],)), s=0.1)
axs[1, 0].set_title(' raw')
axs[1,1].scatter(feat_2D[:,0], feat_2D[:,1], s=0.1)
ax_3d = fig.add_subplot(2, 3, 6, projection='3d')
ax_3d.scatter(feat_3D[:, 0], feat_3D[:, 1], feat_3D[:, 2], s=0.01)
plt.savefig(output_folder/f"KernelPCA_gamma_{gamma}.png",dpi=100)
plt.close()
# plot_KernelPCA(gamma=None)
# plot_KernelPCA(gamma=1)
# plot_KernelPCA(gamma=5)
# plot_KernelPCA(gamma=10)
# plot_KernelPCA(gamma=15)
################################
# t-SNE
################################
# pca_norm_1D = TSNE(n_components=1, learning_rate='auto', init='pca')
# norm_feat_1D = pca_norm_1D.fit(rawSpectralData_normed).fit_transform(rawSpectralData_normed)
# pca_norm_2D = TSNE(n_components=2, learning_rate='auto', init='pca')
# norm_feat_2D = pca_norm_2D.fit(rawSpectralData_normed).fit_transform(rawSpectralData_normed)
# pca_norm_3D = TSNE(n_components=3, learning_rate='auto', init='pca')
# norm_feat_3D = pca_norm_3D.fit(rawSpectralData_normed).fit_transform(rawSpectralData_normed)
# pca_1D = TSNE(n_components=1, learning_rate='auto', init='pca')
# feat_1D = pca_1D.fit(rawSpectralData).fit_transform(rawSpectralData)
# pca_2D = TSNE(n_components=2, learning_rate='auto', init='pca')
# feat_2D = pca_2D.fit(rawSpectralData).fit_transform(rawSpectralData)
# pca_3D = TSNE(n_components=3, learning_rate='auto', init='pca')
# feat_3D = pca_3D.fit(rawSpectralData).fit_transform(rawSpectralData)
# fig, axs = plt.subplots(2, 3,figsize=(15,10))
# axs[0,0].scatter(norm_feat_1D, np.zeros((norm_feat_1D.shape[0],)), s=0.1)
# axs[0, 0].set_title(' norm')
# axs[0,1].scatter(norm_feat_2D[:,0], norm_feat_2D[:,1], s=0.1)
# ax_3d = fig.add_subplot(2, 3, 3, projection='3d')
# ax_3d.scatter(norm_feat_3D[:, 0], norm_feat_3D[:, 1], norm_feat_3D[:, 2], s=0.01)
# axs[1,0].scatter(feat_1D, np.zeros((feat_1D.shape[0],)), s=0.1)
# axs[1, 0].set_title(' raw')
# axs[1,1].scatter(feat_2D[:,0], feat_2D[:,1], s=0.1)
# ax_3d = fig.add_subplot(2, 3, 6, projection='3d')
# ax_3d.scatter(feat_3D[:, 0], feat_3D[:, 1], feat_3D[:, 2], s=0.01)
# plt.savefig(output_folder/"t-SNE.png",dpi=100)
# plt.close()
def plot_DBSCAN(eps=0.15, min_samples=10):
db_norm = DBSCAN(eps=eps, min_samples=min_samples).fit(rawSpectralData_normed)
labels_norm = db_norm.labels_
n_norm = len(set(labels_norm)) - (1 if -1 in labels_norm else 0)
n_noise_norm = list(labels_norm).count(-1)
max_i=0
max_num=0
for i in range(n_norm):
tmp=(labels_norm==i).sum()
if tmp>max_num:
max_i=i
max_num=tmp
fig, axs = plt.subplots(2, 2,figsize=(10,10))
for i in range(labels_norm.shape[0]):
if labels_norm[i]==max_i:
axs[0,0].plot(rawSpectralData_normed[i])
axs[0,0].set_title(f'norm data w norm cluster {max_num},{n_norm},{n_noise_norm}')
for i in range(labels_norm.shape[0]):
if labels_norm[i]==max_i:
axs[0,1].plot(rawSpectralData[i])
axs[0,1].set_title('raw data w norm cluster')
db_raw = DBSCAN(eps=eps, min_samples=min_samples).fit(rawSpectralData)
labels_raw = db_raw.labels_
n_raw = len(set(labels_raw)) - (1 if -1 in labels_raw else 0)
n_noise_raw = list(labels_raw).count(-1)
max_i=0
max_num=0
for i in range(n_raw):
tmp=(labels_raw==i).sum()
if tmp>max_num:
max_i=i
max_num=tmp
for i in range(labels_raw.shape[0]):
if labels_raw[i]==max_i:
axs[1,0].plot(rawSpectralData_normed[i])
axs[1,0].set_title(f'norm data w raw cluster {max_num},{n_raw},{n_noise_raw}')
for i in range(labels_raw.shape[0]):
if labels_raw[i]==max_i:
axs[1,1].plot(rawSpectralData[i])
axs[1,1].set_title('raw data w raw cluster')
plt.savefig(output_folder/f"DBSCAN_eps_{eps}_min_samples_{min_samples}.png",dpi=100)
plt.close()
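# Editor's note: DBSCAN marks noise points with label -1, so the n_* counts above
# exclude noise; the loops pick the most populated cluster (max_i) and overlay the
# spectra belonging to it, once for the normalized data and once for the raw data.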
plot_DBSCAN(eps=0.15, min_samples=10)
# plot_DBSCAN(eps=100, min_samples=10)
# print(file_path.stem,rawSpectralData.shape)
def main(raw_data_folder:pathlib.Path,output_folder:pathlib.Path)->None:
files=list(raw_data_folder.glob("*.npz"))
# plot_npz(files[0],output_folder )
p=multiprocessing.Pool(4)
for i in range(len(files)):
p.apply_async(plot_npz, args=(files[i],output_folder))
print('Waiting for all subprocesses done...')
p.close()
p.join()
print('All subprocesses done.')
if __name__=="__main__":
raw_dataset_folder=pathlib.Path("/data/SEMS-model-training/dataset/raw_dataset")
output_folder=pathlib.Path("/data/SEMS-model-training/dataset/raw_dataset_visual")
main(raw_dataset_folder/"test",output_folder/"test",)
main(raw_dataset_folder/"training",output_folder/"training",)
main(raw_dataset_folder/"validation",output_folder/"validation",)

View File

@ -0,0 +1,276 @@
'''
@File : pre_process.py
@Time : 2024/08/09 13:54:28
@Author : Zhanpeng Yang
@Version : 0.0.1
@Contact : zhpyang@outlook.com
@Desc : 进行数据预处理
'''
import pandas as pd
import logging
import os,glob
import numpy as np
import datetime
import pickle
import h5py
class DataPreProcess:
"""
Contains all data preprocessing steps, including:
1. extracting the spectra around the target time from the raw binary data, using the Excel file provided by the steel plant
2. selecting suitable, stable time frames and spatial points from those spectra as the features
3. scaling the features and labels
"""
def __init__(self, raw_spectral_data_dir, raw_labels_dir,labels_name,dataset_dir,train_ratio=0.8,validate_ratio=0.1) -> None:
"""初始化类,传入所有数据预处理需要的参数
Args:
raw_spectral_data_dir (_type_):原始光谱数据路径
raw_labels_dir (_type_): 原始标签路径
labels_name (_type_): 提取出的标签名
dataset_dir (_type_): 生成的数据集文件夹
choose_frame_spatial (_type_): 类对象进行光谱特征挑选
features_scaling (_type_): 类对象对输入的特征进行放缩使之直接输入神经网络
labels_scaling (_type_): 类对象对输出的的标签进行放缩
train_ratio (float, optional): 训练集比例. Defaults to 0.8.
validate_ratio (float, optional): 验证集比例剩余的即为测试集. Defaults to 0.1.
"""
self.raw_spectral_data_dir=raw_spectral_data_dir
self.raw_labels_dir=raw_labels_dir
self.dataset_dir=dataset_dir
self.labels_name=labels_name
self.train_ratio=train_ratio
self.validate_ratio=validate_ratio
#加载原始的标签
self.raw_labels=pd.read_excel(raw_labels_dir)
#加载原始光谱的缓存
self.raw_spectral_data_cache=self._load_raw_spectral_data_cache(raw_spectral_data_dir)
#随便加载一个csv文件判断光谱数据的维度
self.spectral_dim,self.spatial_dim= self._get_spectral_spatial_dim(self.raw_spectral_data_cache)
#正式开始原始数据转化为数据集
self._raw_data_2_dataset()
# def _load_raw_labels(self,raw_labels_dir:str,labels_name:list)->pd.DataFrame:
# """读取利用脚本处理后的钢厂给的excel文件所在文件夹会扫描所有文件
# 并选择指定的label名作为output
# 并去除值为NaN或0的行
# Args:
# raw_labels_dir (str): 利用脚本处理后的钢厂给的excel路径
# labels_name (list): 指定的作为Label的列
# Returns:
# pd.DataFrame: 返回所有筛选后的炉次数据
# """
# raw_labels=None
# for name in os.listdir(raw_labels_dir):
# tmp_raw_labels=pd.read_excel(os.path.join(raw_labels_dir,name))
# choosed_column=["TSC_start_time","TSC_end_time"]
# choosed_column=choosed_column+labels_name
# #只选出我们想要的部分作为标签
# tmp_raw_labels=tmp_raw_labels.loc[:,choosed_column]
# # 选出有NULL的行
# null_rows=tmp_raw_labels.isnull().any(axis=1)
# # # 选出有0的行
# zeros_rows=(tmp_raw_labels==0).any(axis=1) | (tmp_raw_labels=='0').any(axis=1)
# # # 每一行但凡有NULL或者0都给删了
# selected_rows=~(null_rows|zeros_rows)
# tmp_raw_labels=tmp_raw_labels[selected_rows]
# if raw_labels is None:
# raw_labels=tmp_raw_labels
# else:
# raw_labels=pd.concat([raw_labels,tmp_raw_labels],axis=0)
# logging.debug(f"Reading raw label excel file:{name}, which has {tmp_raw_labels.shape[0]} furnaces")
# logging.debug(f"Readed raw label excel files, which has {raw_labels.shape[0]} furnaces in total")
# return raw_labels
def _load_raw_spectral_data_cache(self,raw_spectral_data_dir:str)->list:
"""生成所有原始光谱数据文件的缓存,包括每个文件记录的开始及结束时间,目的为加快后面读取原始数据的速度
Args:
raw_spectral_data_dir (str): 原始光谱所在路径
Returns:
list: 缓存其中有多少个成员就有多少个原始数据文件每个成员包括格式为datetime的开始及结束时间及文件路径
"""
spectral_file_paths=glob.glob(os.path.join(raw_spectral_data_dir,"*.csv"))
cache_file_paths=glob.glob(os.path.join(raw_spectral_data_dir,"*.pkl"))
update_flag=False
#regenerate the cache when no cache file exists, or when the number of files recorded in the cache differs from the number of files in the folder
if len(cache_file_paths)==0:
logging.debug(f"Raw spectral data cache is not existed! Generating")
update_flag=True
elif len(cache_file_paths)==1:
with open(cache_file_paths[0],"rb") as f:
raw_spectral_data_cache=pickle.load(f)
if len(raw_spectral_data_cache) !=len(spectral_file_paths):
logging.debug(f"Raw spectral data cache is out of date! Regenerating, cache file number:{len(raw_spectral_data_cache)}, spectral data file number: {len(spectral_file_paths)}")
update_flag=True
else:
logging.error(f"More the one 'raw_spectral_data_cache.pkl' file is existed in {raw_spectral_data_dir}")
if update_flag:
spectral_file_paths.sort()
raw_spectral_data_cache=[]
for file in spectral_file_paths:
tmp_info={}
tmp_data=np.loadtxt(file, delimiter=",")
start_t=datetime.datetime.fromtimestamp(tmp_data[0,0]/1000)+datetime.timedelta(microseconds=tmp_data[0,0]%1000)
end_t=datetime.datetime.fromtimestamp(tmp_data[-1,0]/1000)+datetime.timedelta(microseconds=tmp_data[-1,0]%1000)
tmp_info["start_t"]=start_t
tmp_info["end_t"]=end_t
tmp_info["file_path"]=file
raw_spectral_data_cache.append(tmp_info)
with open(os.path.join(raw_spectral_data_dir,f"raw_spectral_data_cache.pkl"),"wb") as f:
pickle.dump(raw_spectral_data_cache,f)
return raw_spectral_data_cache
def _get_spectral_spatial_dim(self,raw_spectral_data_cache):
data=np.loadtxt(raw_spectral_data_cache[0]["file_path"], delimiter=",").astype(np.uint64)
if data[0,2]==229376:
spectral_dim =224
spatial_dim=512
elif data[0,2]==917504:
spectral_dim =448
spatial_dim=1024
else:
raise ValueError(f"Unexpected frame byte size {data[0,2]}; cannot infer spectral/spatial dimensions")
return spectral_dim, spatial_dim
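# Editor's note: the magic byte counts above correspond to one uint16 frame,
# i.e. 224*512*2 = 229376 bytes and 448*1024*2 = 917504 bytes respectively.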
def _read_spectral_data(self,start_time:datetime.datetime,end_time:datetime.datetime)->np.ndarray:
"""获取从start_time到end_time的光谱数据
Args:
start_time (datetime.datetime): 开始时间
end_time (datetime.datetime): 结束时间
Returns:
np.ndarray: 原始光谱数据
"""
def get_spectral_data_per_file(file_path,s_t,e_t):
data=np.loadtxt(file_path, delimiter=",").astype(np.uint64)
if s_t is not None:
tmp_s=datetime.datetime.timestamp(s_t)*1000
tmp_s_index=0
for i in range(data.shape[0]-1):
if data[i,0]<=tmp_s and data[i+1,0]>=tmp_s:
tmp_s_index=i
break
else:
tmp_s_index=0
if e_t is not None:
tmp_e=datetime.datetime.timestamp(e_t)*1000
tmp_e_index=data.shape[0]
for i in range(tmp_s_index,data.shape[0]-1):
if data[i,0]<=tmp_e and data[i+1,0]>=tmp_e:
tmp_e_index=i
break
else:
tmp_e_index=data.shape[0]
with open(file_path[:-3]+"bin", "rb") as f:
f.seek(data[tmp_s_index,1])
d=f.read(np.uint64((tmp_e_index-tmp_s_index)*data[tmp_s_index,2]))
d=np.frombuffer(d, dtype=np.uint16).reshape(tmp_e_index-tmp_s_index,self.spectral_dim,self.spatial_dim)
return data[tmp_s_index:tmp_e_index,0],d
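# Editor's note (reading of the helper above): each CSV row is assumed to hold
# (timestamp in ms, byte offset into the companion .bin file, frame size in bytes);
# the .bin file stores consecutive uint16 frames of shape (spectral_dim, spatial_dim),
# so the helper seeks to the first selected frame and reads
# (tmp_e_index - tmp_s_index) frames in one go.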
timestamps=None
raw_spectral_data=None
for tmp_info in self.raw_spectral_data_cache:
tmp_data=None
if start_time<tmp_info["start_t"] and end_time>tmp_info["end_t"] and end_time<tmp_info["end_t"]:
# 目标时间段在交于此文件时间段的左侧。所以取从文件开始到end time的数据
tmp_time_stamp,tmp_data=get_spectral_data_per_file(tmp_info["file_path"] ,None,end_time)
elif start_time<tmp_info["start_t"] and end_time>tmp_info["end_t"]:
# 目标时间段完全包含此文件时间段。所以取从文件开始到结束的数据
tmp_time_stamp,tmp_data=get_spectral_data_per_file(tmp_info["file_path"] ,None,None)
elif start_time>tmp_info["start_t"] and end_time<tmp_info["end_t"]:
# 目标时间段完全被包含此文件时间段。所以取从start_time到end_time的数据
tmp_time_stamp,tmp_data=get_spectral_data_per_file(tmp_info["file_path"] ,start_time,end_time)
elif start_time>tmp_info["start_t"] and start_time<tmp_info["end_t"] and end_time>tmp_info["end_t"]:
# 目标时间段在交于此文件时间段的右侧。所以取从文件start_time到文件结束的数据
tmp_time_stamp,tmp_data=get_spectral_data_per_file(tmp_info["file_path"] ,start_time,None)
if tmp_data is not None:
if raw_spectral_data is None:
timestamps=tmp_time_stamp
raw_spectral_data=tmp_data
else:
timestamps=np.concatenate((timestamps,tmp_time_stamp),axis=0)
raw_spectral_data=np.concatenate((raw_spectral_data,tmp_data),axis=0)
return timestamps,raw_spectral_data
def _raw_data_2_dataset(self):
save_dir=os.path.join(self.dataset_dir)
#Step 1: select the feature time frames and spatial points, then save the result
pre_dataset_save_dir=os.path.join(save_dir,"raw_dataset")
if not os.path.exists(pre_dataset_save_dir):
os.makedirs(pre_dataset_save_dir)
#Regenerate only when the dataset path does not exist; if it already exists this part is skipped.
# Note that when new raw data arrives, the folders under dataset therefore have to be deleted (i.e. the cache cleared) so everything is regenerated
for i in range(self.raw_labels.shape[0]):
start_time,end_time=self.raw_labels.iloc[i]["TSC_start_time"]+datetime.timedelta(seconds=-10),self.raw_labels.iloc[i]["TSC_end_time"]
timestamps,raw_spectral_data=self._read_spectral_data(start_time,end_time)
logging.debug(f"{self.raw_labels.iloc[i]["TSC_start_time"]+datetime.timedelta(seconds=-10)},{self.raw_labels.iloc[i]["TSC_end_time"]}")
if raw_spectral_data is not None:
#only record when the retrieved data has a frame rate above 2 frames per second
if raw_spectral_data.shape[0]>2*(end_time-start_time).total_seconds():
logging.debug(f"PreProcess Stage 1: [{i+1}/{self.raw_labels.shape[0]}] with {timestamps.shape[0]} frames")
# raw_spectral_data=self.choose_frame_spatial.run(timestamps,raw_spectral_data)
np.savez(os.path.join(pre_dataset_save_dir,f"{self.raw_labels.iloc[i]["Furnace_Number"]}.npz",),furnaceNumber=self.raw_labels.iloc[i]["Furnace_Number"],measureStartDatetime=self.raw_labels.iloc[i]["TSC_start_time"].strftime('%Y-%m-%d %H:%M:%S'),measureEndDatetime=self.raw_labels.iloc[i]["TSC_end_time"].strftime('%Y-%m-%d %H:%M:%S'),timestamps=timestamps,rawSpectralData=raw_spectral_data,rawLabels=self.raw_labels.iloc[i][self.labels_name].to_numpy(),labelNames=["Temperature","C","P","S","Mn","Ni","Mo","Cr"])
# else:
# logging.info(f"Pre Dataset is existed in {pre_dataset_save_dir}")
if __name__=="__main__":
logging.basicConfig(level = logging.DEBUG)
raw_data_dir="/data/SEMS-model-training/old_rawdata"
labels_path="/data/SEMS-model-training/labels/NanEr/2024-05-15_08-03_2335_.xlsx"
dataset_dir="/data/SEMS-model-training/dataset"
labels_name=["TSC_T","TSC_C","TSC_P","TSC_S","TSC_Mn","TSC_Ni","TSC_Mo","TSC_Cr"]
# from choose_frame_spatial.mean import ChooseFrameSpatial
# from features_scaling.max_min import FeatureScaling
# from labels_scaling.max_min import LabelScaling
# choose_frame_spatial=ChooseFrameSpatial(interval=[-30,30])
# features_scaling=FeatureScaling()
# labels_scaling=LabelScaling()
data_pre_process=DataPreProcess(raw_data_dir,labels_path,labels_name,dataset_dir)