stable version: prepare to support training different models on different devices
parent cfac5fd675
commit 6e333fe733

.gitignore (vendored): 3 lines changed
@@ -1,5 +1,4 @@
dataset
rawdata
old_rawdata
labels
logs
CHANGELOG.md: 12 lines changed
@@ -1,5 +1,15 @@
# Change Log

## v0.1.0.20240910_alpha
## v0.0.2.20241015_alpha

# [⭐️Features]

- Fetch data from the MySQL database

# [🔄Changed]

- Reworked the data processing flow: data is now first fetched from the database and split into training and test sets, producing intermediate files, and only then are follow-up steps such as spatial point selection applied

## v0.0.1.20240910_alpha

- Got the end-to-end pipeline running
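The reworked flow above corresponds to the data scripts touched in this commit. A minimal sketch of how they chain together (arguments and class signatures taken from the code in this diff; the entry point and the module path of DataPreProcess are assumptions, not shown here):

# Sketch of the reworked pipeline (simplified, assumed entry point):
# 1) pull the latest heats from MySQL plus the old raw files and write the
#    intermediate train/validation/test .npz files under raw_dataset/,
# 2) then run frame/spatial selection and feature/label scaling on those files.
from data.generate_raw_dataset_from_MySQL import generateRawDataset
from data.pre_process import DataPreProcess          # module path assumed for the DataPreProcess class in this commit
from data.choose_frame_spatial.DBSCAN import ChooseFrameSpatial
from data.features_scaling.standardization import FeatureScaling
from data.labels_scaling.standardization import LabelScaling

generateRawDataset(datasetDir="/data/SEMS-model-training/dataset/raw_dataset",
                   datasetNum=1000, validationRate=0.2, testRate=0.1)

DataPreProcess(dataset_dir="/data/SEMS-model-training/dataset",
               choose_frame_spatial=ChooseFrameSpatial(),
               features_scaling=FeatureScaling(),
               labels_scaling=LabelScaling())        # __init__ calls run() itself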
TODO.md (new file): 15 lines added
@@ -0,0 +1,15 @@
- [ ] Survey all tunable hyperparameters
- [ ] Survey loss designs for regression tasks
- [ ] Survey model architectures usable for regression and their tunable parameters
- [ ] Survey methods for escaping local optima
- [x] Convert the data in the database into training, validation, and test sets
- [x] Run the training and validation sets through different preprocessing, models, and hyperparameters to find the best model
- [x] Save the best model and its corresponding hyperparameters, and evaluate it on the test set to obtain the test error
- [x] Use different models for different target parameters
@@ -5,16 +5,16 @@ defaults:

task_name: main

raw_spectral_data_dir: /code/admin/20240806-NanEr-5-8-data/rawdata
raw_labels_dir: /code/admin/20240806-NanEr-5-8-data/labels/NanEr
dataset_dir: /code/admin/20240806-NanEr-5-8-data/dataset
labels_name: ["TSC_T","TSC_C","TSC_C_lab","TSC_P_lab"]
# raw_spectral_data_dir: /data/SEMS-model-training/rawdata
# raw_labels_dir: /data/SEMS-model-training/labels/NanEr
dataset_dir: /data/SEMS-model-training/dataset
# labels_name: ["TSC_T","TSC_C","TSC_C_lab","TSC_P_lab"]

dataset:
  train_ratio: 0.8
  validate_ratio: 0.1
  num_worker: 128
  # train_ratio: 0.8
  # validate_ratio: 0.1
  num_worker: 0

train:
  max_epoch: 10000
  max_epoch: 5000
  checkpoint_interval: 100
@@ -1,40 +1,37 @@
run:
  num_samples: 1
  resources_per_trial:
    cpu: 128
    gpu: 1
    cpu: 4
    gpu: 0.1

scheduler:
  _target_: ray.tune.schedulers.ASHAScheduler
  metric: val_loss
  mode: min
  max_t: 20000
  grace_period: 1
  reduction_factor: 2
  _target_: ray.tune.schedulers.FIFOScheduler
  # _target_: ray.tune.schedulers.ASHAScheduler
  # metric: val_loss
  # mode: min
  # max_t: 20000
  # grace_period: 1
  # reduction_factor: 2

config:
  choose_frame_spatial:
    _target_: ray.tune.grid_search
    values:
      - _target_: data.choose_frame_spatial.mean.ChooseFrameSpatial
        interval: [-3,0]
      - _target_: data.choose_frame_spatial.DBSCAN.ChooseFrameSpatial  ##DBSCAN_data_augmentation

  features_scaling:
    _target_: ray.tune.grid_search
    values:
      - _target_: data.features_scaling.max_min.FeatureScaling
      - _target_: data.features_scaling.standardization.FeatureScaling

  labels_scaling:
    _target_: ray.tune.grid_search
    values:
      - _target_: data.labels_scaling.max_min.LabelScaling
      - _target_: data.labels_scaling.standardization.LabelScaling  #standardization

  model:
    _target_: ray.tune.grid_search
    values:
      # - _target_: model.FCNN.FCNN
      # - _target_: model.CNN_LSTM_FCNN.CNN_LSTM_FCNN
      - _target_: model.CNN1D_FCNN.CNN1D_FCNN
      - _target_: model.DRSN-CW.SpectralModel
      # - _target_: model.FCNN_DBSCAN_small.SpectralModel

  criterion:
    _target_: ray.tune.grid_search
    values:
@@ -42,10 +39,15 @@ config:
  optimizer:
    _target_: ray.tune.grid_search
    values:
      - _target_: torch.optim.Adam
        _partial_: true
        lr: 0.000001
      - _target_: torch.optim.Adam
        _partial_: true
        lr: 0.0001

  batch_size:
    _target_: ray.tune.grid_search
    values:
      - 1024
      - 128
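For orientation, a minimal sketch of how a grid-search space like the one above is typically consumed. The training entry point is not part of this diff, so the `trainable` function, the Hydra config group, and the attribute names on `cfg` are assumptions:

# Minimal sketch (assumed entry point, not part of this commit): Hydra instantiates the
# ray.tune.grid_search nodes, and Ray Tune expands them into one trial per combination.
import hydra
from hydra.utils import instantiate
from ray import tune

@hydra.main(config_path="conf", config_name="main", version_base=None)
def main(cfg):
    param_space = instantiate(cfg.tune.config)    # grid over preprocessing / scaling / model / optimizer
    scheduler = instantiate(cfg.tune.scheduler)   # FIFOScheduler after this change

    def trainable(config):                        # placeholder; the real training loop lives elsewhere
        return {"val_loss": 0.0}

    tuner = tune.Tuner(
        tune.with_resources(trainable, {"cpu": 4, "gpu": 0.1}),
        param_space=param_space,
        tune_config=tune.TuneConfig(num_samples=1, scheduler=scheduler),
    )
    tuner.fit()

if __name__ == "__main__":
    main()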
BIN src/data/__pycache__/pre_process_backup.cpython-312.pyc (new file; binary not shown)
Binary files not shown.
src/data/choose_frame_spatial/DBSCAN.py (new file): 142 lines added
@@ -0,0 +1,142 @@
'''
@File    : DBSCAN.py
@Time    : 2024/08/12 13:53:36
@Author  : Zhanpeng Yang
@Version : 0.0.1
@Contact : zhpyang@outlook.com
@Desc    : Stage-one data preprocessing: select suitable spectra from the continuous spectral stream as the input features
'''

import datetime
import numpy as np
from sklearn.cluster import DBSCAN
from pathlib import Path

class ChooseFrameSpatial:
    def __init__(self, eps=0.15, min_samples=10):
        """Note: every hyperparameter the algorithm needs is passed in here and stored as a class member; the other methods must not take hyperparameters.
        """
        self.eps = eps
        self.min_samples = min_samples
        self.description = f"{str(self.__class__)[8:-2].split('.')[-2]}_eps_{eps}_min_samples_{min_samples}"  # no change needed unless new hyperparameters are added

        self.file_path = Path(__file__).absolute()

    def load_state_dict(self, state):
        self.eps = state["eps"]
        self.min_samples = state["min_samples"]

    def state_dict(self):
        return {"eps": self.eps, "min_samples": self.min_samples}

    def get_specific_data(self, measureStartDatetime, measureEndDatetime, timestamps, rawdata):
        """
        Fetch the spectral data inside the specified time window.
        """
        if isinstance(measureStartDatetime.item(), str):
            start_timestamp = datetime.datetime.strptime(measureStartDatetime.item(), "%Y-%m-%d %H:%M:%S").timestamp()*1000 - 1000  # one second before the measurement start (timestamps in this project are in ms)
            end_timestamp = datetime.datetime.strptime(measureStartDatetime.item(), "%Y-%m-%d %H:%M:%S").timestamp()*1000 + 1000    # one second after the measurement start
        else:
            start_timestamp = measureStartDatetime.item().timestamp()*1000 - 1000
            end_timestamp = measureStartDatetime.item().timestamp()*1000 + 1000

        start_index = 0
        end_index = timestamps.shape[0]
        for i in range(1, timestamps.shape[0]):
            if timestamps[i] >= start_timestamp and timestamps[i-1] <= start_timestamp:
                # print(f"start index: {i}")
                start_index = i
            if timestamps[i] >= end_timestamp and timestamps[i-1] <= end_timestamp:
                # print(f"end index: {i}")
                end_index = i

        return rawdata[start_index:end_index, ...]

    def run(self, measureStartDatetime, measureEndDatetime, timestamps, rawdata):

        rawSpectralData = self.get_specific_data(measureStartDatetime, measureEndDatetime, timestamps, rawdata)

        rawSpectralData = rawSpectralData.transpose(0, 2, 1)
        rawSpectralData = rawSpectralData.reshape((rawSpectralData.shape[0]*rawSpectralData.shape[1], rawSpectralData.shape[2]))

        rawSpectralData_normed = (rawSpectralData-np.min(rawSpectralData, axis=1, keepdims=True))/(np.max(rawSpectralData, axis=1, keepdims=True)-np.min(rawSpectralData, axis=1, keepdims=True))
        db_norm = DBSCAN(eps=self.eps, min_samples=self.min_samples).fit(rawSpectralData_normed)

        labels_norm = db_norm.labels_
        n_norm = len(set(labels_norm)) - (1 if -1 in labels_norm else 0)

        if n_norm == 0:
            return None

        max_i = 0
        max_num = 0
        for i in range(n_norm):
            tmp = (labels_norm == i).sum()
            if tmp > max_num:
                max_i = i
                max_num = tmp

        selected_data = rawSpectralData_normed[labels_norm == max_i, :]
        selected_data = np.mean(selected_data, axis=0)

        # space_intensity_mean = np.mean(np.sum(selected_data, axis=1), axis=0)
        # space_num = 10  # 10 spatial points with suitable mean intensity and the smallest variance
        # light_indices = np.where( (space_intensity_mean > (0.25*np.max(space_intensity_mean)+0.75*np.min(space_intensity_mean)))
        #                           & (space_intensity_mean < (0.8*np.max(space_intensity_mean)+0.2*np.min(space_intensity_mean)))
        #                           )
        # if light_indices[0].shape[0] < space_num:
        #     print('No Enough Usable Space Point Found!')
        #     return None

        # intensity_data = selected_data[:,:,light_indices[0]]

        # # spatial_variance = np.var(np.sum(intensity_data,axis=1), axis=0)
        # # top_10_indices = np.argsort(spatial_variance)[:space_num]
        # space_selected_data = intensity_data[:,:,:]

        # space_selected_data = np.mean(np.mean(space_selected_data,axis=2),axis=0)

        # space_selected_data = (space_selected_data - np.min(space_selected_data)) / (np.max(space_selected_data) - np.min(space_selected_data))

        # # Savitzky-Golay filter
        # data_filtered = savgol_filter(space_selected_data, window_length=31, polyorder=2, axis=0)

        # spec_indices = [106, 127, 118, 0, 52, 23, 113, 77, 157, 175, 195, 24, 218, 108, 8, 211, 2, 136, 11,
        #                 36, 129, 109, 153, 188, 200, 196, 7, 19, 6, 198, 193, 221, 156, 187, 162, 204, 85, 104, 41,
        #                 16, 185, 125, 14, 149, 91, 138, 72, 146, 35, 53, 190, 148, 75, 18, 17, 96, 167, 192, 201, 31,
        #                 158, 183, 32, 40, 123, 145, 161, 27, 209, 216, 101, 51, 147, 58, 182, 49, 119, 13, 179, 140,
        #                 105, 45, 55, 33, 73, 111, 97, 194, 121, 89, 38, 12, 197, 173, 160, 131, 141, 37, 208, 47]

        # selected_data = data_filtered[spec_indices]

        return selected_data

if __name__ == "__main__":
    raw_data = np.load("/data/SEMS-model-training/dataset/raw_dataset/test/24609591.npz", allow_pickle=True)
    print(raw_data.keys())

    print(f"furnace number\t: {raw_data['furnaceNumber']}")
    print(f"measurement start time\t: {raw_data['measureStartDatetime']}")
    print(f"measurement end time\t: {raw_data['measureEndDatetime']}")
    print(f"timestamps shape\t: {raw_data['timestamps'].shape}")
    print(f"raw spectral data shape\t: {raw_data['rawSpectralData'].shape}")

    tmp = ChooseFrameSpatial()
    output = tmp.run(raw_data["measureStartDatetime"], raw_data["measureEndDatetime"], raw_data["timestamps"], raw_data["rawSpectralData"])

    print(f"input shape: {raw_data['rawSpectralData'].shape}\noutput shape: {output.shape}")
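What run() does above, in short: min-max normalize every (frame, spatial point) spectrum row-wise, cluster the rows with DBSCAN, and average the largest cluster into a single input spectrum. A small self-contained illustration of that selection step on synthetic data (not part of the commit; shapes and values are made up, eps/min_samples copied from the defaults above):

# Toy illustration of the largest-cluster selection in ChooseFrameSpatial.run() (synthetic data).
import numpy as np
from sklearn.cluster import DBSCAN

rng = np.random.default_rng(0)
base = np.sin(np.linspace(0, np.pi, 64))                                   # a shared spectral shape
cluster = base * rng.uniform(0.5, 1.5, size=(200, 1)) + rng.normal(0, 0.005, size=(200, 64))
outliers = rng.uniform(0.0, 1.0, size=(20, 64))                            # scattered outlier frames
spectra = np.vstack([cluster, outliers])

normed = (spectra - spectra.min(axis=1, keepdims=True)) / (
    spectra.max(axis=1, keepdims=True) - spectra.min(axis=1, keepdims=True))

labels = DBSCAN(eps=0.15, min_samples=10).fit(normed).labels_
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
if n_clusters:
    largest = max(range(n_clusters), key=lambda i: (labels == i).sum())
    feature = normed[labels == largest].mean(axis=0)                       # one averaged spectrum -> model input
    print(n_clusters, feature.shape)                                       # typically prints: 1 (64,)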
src/data/choose_frame_spatial/DBSCAN_data_augmentation.py (new file): 102 lines added
@@ -0,0 +1,102 @@
'''
@File    : DBSCAN_data_augmentation.py
@Time    : 2024/08/12 13:53:36
@Author  : Zhanpeng Yang
@Version : 0.0.1
@Contact : zhpyang@outlook.com
@Desc    : Stage-one data preprocessing: select suitable spectra from the continuous spectral stream as the input features
'''

import datetime
import numpy as np
from sklearn.cross_decomposition import PLSRegression  # not used in this variant
from scipy.signal import savgol_filter                 # not used in this variant
from sklearn.cluster import DBSCAN

class ChooseFrameSpatial:
    def __init__(self, eps=0.15, min_samples=10):
        """Note: every hyperparameter the algorithm needs is passed in here and stored as a class member; the other methods must not take hyperparameters.
        """
        self.eps = eps
        self.min_samples = min_samples
        self.description = f"{str(self.__class__)[8:-2].split('.')[-2]}_eps_{eps}_min_samples_{min_samples}"  # no change needed unless new hyperparameters are added

    def get_specific_data(self, measureStartDatetime, measureEndDatetime, timestamps, rawdata):
        """
        Fetch the spectral data inside the specified time window.
        """
        if isinstance(measureStartDatetime.item(), str):
            start_timestamp = datetime.datetime.strptime(measureStartDatetime.item(), "%Y-%m-%d %H:%M:%S").timestamp()*1000 - 1000  # one second before the measurement start (timestamps in this project are in ms)
            end_timestamp = datetime.datetime.strptime(measureStartDatetime.item(), "%Y-%m-%d %H:%M:%S").timestamp()*1000 + 1000    # one second after the measurement start
        else:
            start_timestamp = measureStartDatetime.item().timestamp()*1000 - 1000
            end_timestamp = measureStartDatetime.item().timestamp()*1000 + 1000

        start_index = 0
        end_index = timestamps.shape[0]
        for i in range(1, timestamps.shape[0]):
            if timestamps[i] >= start_timestamp and timestamps[i-1] <= start_timestamp:
                # print(f"start index: {i}")
                start_index = i
            if timestamps[i] >= end_timestamp and timestamps[i-1] <= end_timestamp:
                # print(f"end index: {i}")
                end_index = i

        return rawdata[start_index:end_index, ...]

    def run(self, measureStartDatetime, measureEndDatetime, timestamps, rawdata):

        rawSpectralData = self.get_specific_data(measureStartDatetime, measureEndDatetime, timestamps, rawdata)

        rawSpectralData = rawSpectralData.transpose(0, 2, 1)
        rawSpectralData = rawSpectralData.reshape((rawSpectralData.shape[0]*rawSpectralData.shape[1], rawSpectralData.shape[2]))

        rawSpectralData_normed = (rawSpectralData-np.min(rawSpectralData, axis=1, keepdims=True))/(np.max(rawSpectralData, axis=1, keepdims=True)-np.min(rawSpectralData, axis=1, keepdims=True))
        db_norm = DBSCAN(eps=self.eps, min_samples=self.min_samples).fit(rawSpectralData_normed)

        labels_norm = db_norm.labels_
        n_norm = len(set(labels_norm)) - (1 if -1 in labels_norm else 0)

        if n_norm == 0:
            return None

        max_i = 0
        max_num = 0
        for i in range(n_norm):
            tmp = (labels_norm == i).sum()
            if tmp > max_num:
                max_i = i
                max_num = tmp

        selected_data = rawSpectralData_normed[labels_norm == max_i, :]
        # selected_data = np.mean(selected_data, axis=0)   # data augmentation: keep every member of the largest cluster instead of averaging

        return selected_data

if __name__ == "__main__":
    raw_data = np.load("/data/SEMS-model-training/dataset/raw_dataset/test/24609591.npz", allow_pickle=True)
    print(raw_data.keys())

    print(f"furnace number\t: {raw_data['furnaceNumber']}")
    print(f"measurement start time\t: {raw_data['measureStartDatetime']}")
    print(f"measurement end time\t: {raw_data['measureEndDatetime']}")
    print(f"timestamps shape\t: {raw_data['timestamps'].shape}")
    print(f"raw spectral data shape\t: {raw_data['rawSpectralData'].shape}")

    tmp = ChooseFrameSpatial()
    output = tmp.run(raw_data["measureStartDatetime"], raw_data["measureEndDatetime"], raw_data["timestamps"], raw_data["rawSpectralData"])

    print(f"input shape: {raw_data['rawSpectralData'].shape}\noutput shape: {output.shape}")
BIN src/data/choose_frame_spatial/__pycache__/DBSCAN.cpython-312.pyc (new file; binary not shown)
Binary files not shown.
@ -1,100 +0,0 @@
|
||||
'''
|
||||
@File : mean.py
|
||||
@Time : 2024/08/12 13:53:36
|
||||
@Author : Zhanpeng Yang
|
||||
@Version : 0.0.1
|
||||
@Contact : zhpyang@outlook.com
|
||||
@Desc : 数据第一阶段预处理,从连续光谱数据中选出合适的光谱作为input的特征
|
||||
'''
|
||||
|
||||
|
||||
|
||||
import datetime
|
||||
import numpy as np
|
||||
class ChooseFrameSpatial:
|
||||
def __init__(self, interval=[-3,0]):
|
||||
"""注意,算法所需要的超参数均从此处传入,作为类成员,其他函数不允许传入超参数
|
||||
|
||||
Args:
|
||||
interval (list, optional): 此方法依赖的光谱时间范围,默认[-30,30],即获取前TSC开始时刻前30秒到TSC结束时刻的光谱数据作为此算法输入.
|
||||
"""
|
||||
self.description=f"{str(self.__class__).split(".")[2]}_{interval[0]}_{interval[1]}"
|
||||
# print(self.description)
|
||||
self.interval=interval
|
||||
|
||||
def get_data_interval(self, start_time, end_time ):
|
||||
|
||||
return start_time+datetime.timedelta(seconds=self.interval[0]),start_time+datetime.timedelta(seconds=self.interval[1])
|
||||
|
||||
def run(self,timestamps,rawdata):
|
||||
############# 空间数据压缩 ######################
|
||||
space_intensity_mean = np.mean(np.sum(rawdata, axis=1), axis=0)
|
||||
|
||||
space_num= 10 #选取方差最小的点的个数
|
||||
|
||||
light_indices = np.where( (space_intensity_mean > (0.25*np.max(space_intensity_mean)+0.75*np.min(space_intensity_mean)))
|
||||
& (space_intensity_mean < (0.8*np.max(space_intensity_mean)+0.2*np.min(space_intensity_mean)))
|
||||
)
|
||||
|
||||
if light_indices[0].shape[0] < space_num:
|
||||
print('No Enough Usable Space Point Found!')
|
||||
return None
|
||||
|
||||
intensity_data = rawdata[:,:,light_indices[0]]
|
||||
|
||||
spatial_variance = np.var(np.sum(intensity_data,axis=1), axis=0)
|
||||
top_10_indices = np.argsort(spatial_variance)[:space_num]
|
||||
space_selected_data = intensity_data[:, :, top_10_indices]
|
||||
|
||||
############# 时间数据压缩 ######################
|
||||
# framerate = 24
|
||||
# timewindow = 2
|
||||
|
||||
# time_data_intensity = np.sum(np.mean(space_selected_data,axis=2),axis=1)
|
||||
|
||||
# min_var = float('inf')
|
||||
# min_index = 0
|
||||
|
||||
# for i in range(len(time_data_intensity)-framerate*timewindow-1):
|
||||
# window_var = np.var(time_data_intensity[i:i+framerate*timewindow])
|
||||
# if window_var < min_var:
|
||||
# min_var = window_var
|
||||
# min_index = i
|
||||
|
||||
# selected_data = space_selected_data[min_index:min_index+framerate*timewindow,:,:]
|
||||
|
||||
selected_data=np.mean(np.mean(space_selected_data,axis=0),axis=1)
|
||||
# print("timewindow_begin=",min_index)
|
||||
|
||||
# if (result_dir is not None) and (filenum is not None) :
|
||||
|
||||
# for i in range (selected_data.shape[2]):
|
||||
|
||||
# Z=selected_data[:,:,i]
|
||||
# x=np.linspace(400,1000,Z.shape[1])
|
||||
# y=selected_data[:,1,i]
|
||||
|
||||
# x, y = np.meshgrid(x, y)
|
||||
|
||||
# fig, ax = plt.subplots(subplot_kw=dict(projection='3d'))
|
||||
|
||||
# surf = ax.plot_surface(x, y, Z, cmap=cm.Blues,
|
||||
# linewidth=0, antialiased=False)
|
||||
|
||||
# ax.view_init(elev=30, azim=-60) # 更改视角
|
||||
|
||||
# ax.set_zlim(0, 4095)
|
||||
# ax.set_zlabel("Light Intensity")
|
||||
# ax.set_xlabel("Spectral Band(nm)")
|
||||
# ax.set_ylabel("Time (Serial Number)")
|
||||
# plt.savefig(f"{result_dir}{ filenum} file {i}-th spatial_point_line.png")
|
||||
# plt.close()
|
||||
|
||||
return selected_data
|
||||
|
||||
if __name__ =="__main__":
|
||||
timpestamp=np.zeros((600,))
|
||||
input_data=np.random.random((600,224,512))*4095
|
||||
tmp=ChooseFrameSpatial()
|
||||
output=tmp.run(timpestamp,input_data)
|
||||
print(f"输入数据维度:{input_data.shape}\n输出数据维度:{output.shape}")
|
src/data/choose_frame_spatial/mean_CARS100_3.py (new file): 125 lines added
@ -0,0 +1,125 @@
|
||||
'''
|
||||
@File : mean.py
|
||||
@Time : 2024/08/12 13:53:36
|
||||
@Author : Zhanpeng Yang
|
||||
@Version : 0.0.1
|
||||
@Contact : zhpyang@outlook.com
|
||||
@Desc : 数据第一阶段预处理,从连续光谱数据中选出合适的光谱作为input的特征
|
||||
'''
|
||||
|
||||
|
||||
|
||||
import datetime
|
||||
import numpy as np
|
||||
from sklearn.cross_decomposition import PLSRegression
|
||||
from scipy.signal import savgol_filter
|
||||
from scipy.ndimage import median_filter
|
||||
|
||||
class ChooseFrameSpatial:
|
||||
def __init__(self):
|
||||
"""注意,算法所需要的超参数均从此处传入,作为类成员,其他函数不允许传入超参数
|
||||
"""
|
||||
|
||||
self.description=f"{str(self.__class__)[8:-2].split(".")[-2]}" #此项当没有新增超参数时不需要修改
|
||||
|
||||
print(self.description)
|
||||
|
||||
def get_specific_data(self,measureStartDatetime,measureEndDatetime,timestamps,rawdata):
|
||||
"""
|
||||
获取指定时间的内的光谱数据
|
||||
"""
|
||||
|
||||
|
||||
if isinstance(measureStartDatetime.item(),str):
|
||||
start_timestamp=datetime.datetime.strptime(measureStartDatetime.item(), "%Y-%m-%d %H:%M:%S").timestamp()*1000 #测量开始时间(此项目中时间戳均以ms为单位)
|
||||
end_timestamp=datetime.datetime.strptime(measureStartDatetime.item(), "%Y-%m-%d %H:%M:%S").timestamp()*1000 + 3000 #测量开始时间+3秒
|
||||
else:
|
||||
start_timestamp=measureStartDatetime.item().timestamp()*1000
|
||||
end_timestamp=measureStartDatetime.item().timestamp()*1000 + 3000
|
||||
|
||||
start_index=0
|
||||
end_index=timestamps.shape[0]
|
||||
for i in range(1,timestamps.shape[0]):
|
||||
if timestamps[i]>=start_timestamp and timestamps[i-1]<=start_timestamp:
|
||||
# print(f"开始索引为{i}")
|
||||
start_index=i
|
||||
if timestamps[i]>=end_timestamp and timestamps[i-1]<=end_timestamp:
|
||||
# print(f"结束索引为{i}")
|
||||
end_index=i
|
||||
|
||||
|
||||
return rawdata[start_index:end_index,...]
|
||||
|
||||
|
||||
|
||||
|
||||
def run(self,measureStartDatetime,measureEndDatetime,timestamps,rawdata):
|
||||
|
||||
selected_data=self.get_specific_data(measureStartDatetime,measureEndDatetime,timestamps,rawdata)
|
||||
|
||||
|
||||
|
||||
space_intensity_mean = np.mean(np.sum(selected_data, axis=1), axis=0)
|
||||
space_num= 10 #空间上平均光强合适,且方差最小的10个点
|
||||
light_indices = np.where( (space_intensity_mean > (0.25*np.max(space_intensity_mean)+0.75*np.min(space_intensity_mean)))
|
||||
& (space_intensity_mean < (0.8*np.max(space_intensity_mean)+0.2*np.min(space_intensity_mean)))
|
||||
)
|
||||
if light_indices[0].shape[0] < space_num:
|
||||
print('No Enough Usable Space Point Found!')
|
||||
return None
|
||||
|
||||
intensity_data = selected_data[:,:,light_indices[0]]
|
||||
|
||||
# spatial_variance = np.var(np.sum(intensity_data,axis=1), axis=0)
|
||||
# top_10_indices = np.argsort(spatial_variance)[:space_num]
|
||||
|
||||
time_window = 11
|
||||
time_steps, wavelengths, spatial_points = intensity_data.shape
|
||||
filtered_data = np.zeros_like(intensity_data)
|
||||
for wavelength in range(wavelengths):
|
||||
for spatial_point in range(spatial_points):
|
||||
# 提取时间序列
|
||||
time_series = intensity_data[:, wavelength, spatial_point]
|
||||
# 应用中值滤波
|
||||
filtered_time_series = median_filter(time_series, size=time_window)
|
||||
# 将滤波后的时间序列放回原位置
|
||||
filtered_data[:, wavelength, spatial_point] = filtered_time_series
|
||||
|
||||
|
||||
space_selected_data = filtered_data[:,:,:]
|
||||
|
||||
space_selected_data = np.mean(np.mean(space_selected_data,axis=2),axis=0)
|
||||
|
||||
# space_selected_data = (space_selected_data - np.min(space_selected_data)) / (np.max(space_selected_data) - np.min(space_selected_data))
|
||||
|
||||
# # Savitzky-Golay filter
|
||||
# data_filtered = savgol_filter(space_selected_data, window_length=31, polyorder=2, axis=0)
|
||||
|
||||
# spec_indices = [106, 127, 118, 0, 52, 23, 113, 77, 157, 175, 195, 24, 218, 108, 8, 211, 2, 136, 11,
|
||||
# 36, 129, 109, 153, 188, 200, 196, 7, 19, 6, 198, 193, 221, 156, 187, 162, 204, 85, 104, 41,
|
||||
# 16, 185, 125, 14, 149, 91, 138, 72, 146, 35, 53, 190, 148, 75, 18, 17, 96, 167, 192, 201, 31,
|
||||
# 158, 183, 32, 40, 123, 145, 161, 27, 209, 216, 101, 51, 147, 58, 182, 49, 119, 13, 179, 140,
|
||||
# 105, 45, 55, 33, 73, 111, 97, 194, 121, 89, 38, 12, 197, 173, 160, 131, 141, 37, 208, 47]
|
||||
|
||||
|
||||
# selected_data = data_filtered[spec_indices]
|
||||
|
||||
# print(space_selected_data.shape)
|
||||
return space_selected_data
|
||||
# return selected_data
|
||||
|
||||
if __name__ =="__main__":
|
||||
raw_data=np.load("/data/SEMS-model-training/dataset/raw_dataset/training/24604919.npz",allow_pickle=True)
|
||||
print(raw_data.keys())
|
||||
|
||||
print(f"炉次号\t:{raw_data["furnaceNumber"]}")
|
||||
print(f"开始测量时间\t:{raw_data["measureStartDatetime"]}")
|
||||
print(f"结束测量时间\t:{raw_data["measureEndDatetime"]}")
|
||||
print(f"时间戳维度\t:{raw_data["timestamps"].shape}")
|
||||
print(f"原始光谱数据维度\t:{raw_data["rawSpectralData"].shape}")
|
||||
|
||||
tmp=ChooseFrameSpatial()
|
||||
output=tmp.run(raw_data["measureStartDatetime"],raw_data["measureEndDatetime"],raw_data["timestamps"],raw_data["rawSpectralData"])
|
||||
|
||||
|
||||
print(f"输入数据维度:{raw_data["rawSpectralData"].shape}\n输出数据维度:{output.shape}")
|
src/data/choose_frame_spatial/mean_chose_top_10_spatial.py (new file): 83 lines added
@@ -0,0 +1,83 @@
'''
@File    : mean_chose_top_10_spatial.py
@Time    : 2024/08/12 13:53:36
@Author  : Zhanpeng Yang
@Version : 0.0.1
@Contact : zhpyang@outlook.com
@Desc    : Stage-one data preprocessing: select suitable spectra from the continuous spectral stream as the input features
'''

import datetime
import numpy as np

class ChooseFrameSpatial:
    def __init__(self,):
        """Note: every hyperparameter the algorithm needs is passed in here and stored as a class member; the other methods must not take hyperparameters.
        """
        self.description = f"{str(self.__class__)[8:-2].split('.')[-2]}"  # no change needed unless new hyperparameters are added

        print(self.description)

    def get_specific_data(self, measureStartDatetime, measureEndDatetime, timestamps, rawdata):
        """
        Fetch the spectral data inside the specified time window.
        """
        start_timestamp = datetime.datetime.strptime(measureStartDatetime.item(), "%Y-%m-%d %H:%M:%S").timestamp()*1000 - 1000  # one second before the measurement start (timestamps in this project are in ms)
        end_timestamp = datetime.datetime.strptime(measureStartDatetime.item(), "%Y-%m-%d %H:%M:%S").timestamp()*1000 + 1000    # one second after the measurement start

        start_index = 0                     # defaults in case the window is not found
        end_index = timestamps.shape[0]
        for i in range(1, timestamps.shape[0]):
            if timestamps[i] >= start_timestamp and timestamps[i-1] <= start_timestamp:
                # print(f"start index: {i}")
                start_index = i
            if timestamps[i] >= end_timestamp and timestamps[i-1] <= end_timestamp:
                # print(f"end index: {i}")
                end_index = i

        return rawdata[start_index:end_index, ...]

    def run(self, measureStartDatetime, measureEndDatetime, timestamps, rawdata):

        selected_data = self.get_specific_data(measureStartDatetime, measureEndDatetime, timestamps, rawdata)

        space_intensity_mean = np.mean(np.sum(selected_data, axis=1), axis=0)
        space_num = 10  # 10 spatial points with suitable mean intensity and the smallest variance
        light_indices = np.where( (space_intensity_mean > (0.25*np.max(space_intensity_mean)+0.75*np.min(space_intensity_mean)))
                                  & (space_intensity_mean < (0.8*np.max(space_intensity_mean)+0.2*np.min(space_intensity_mean)))
                                  )
        if light_indices[0].shape[0] < space_num:
            print('No Enough Usable Space Point Found!')
            return None

        intensity_data = selected_data[:,:,light_indices[0]]

        spatial_variance = np.var(np.sum(intensity_data, axis=1), axis=0)
        top_10_indices = np.argsort(spatial_variance)[:space_num]
        selected_data = intensity_data[:, :, top_10_indices]

        return selected_data

if __name__ == "__main__":
    raw_data = np.load("/data/SEMS-model-training/dataset/raw_dataset/training/24604919.npz", allow_pickle=True)
    print(raw_data.keys())

    print(f"furnace number\t: {raw_data['furnaceNumber']}")
    print(f"measurement start time\t: {raw_data['measureStartDatetime']}")
    print(f"measurement end time\t: {raw_data['measureEndDatetime']}")
    print(f"timestamps shape\t: {raw_data['timestamps'].shape}")
    print(f"raw spectral data shape\t: {raw_data['rawSpectralData'].shape}")

    tmp = ChooseFrameSpatial()
    output = tmp.run(raw_data["measureStartDatetime"], raw_data["measureEndDatetime"], raw_data["timestamps"], raw_data["rawSpectralData"])

    print(f"input shape: {raw_data['rawSpectralData'].shape}\noutput shape: {output.shape}")
@ -1,132 +0,0 @@
|
||||
'''
|
||||
@File : mean.py
|
||||
@Time : 2024/08/12 13:53:36
|
||||
@Author : Zhanpeng Yang
|
||||
@Version : 0.0.1
|
||||
@Contact : zhpyang@outlook.com
|
||||
@Desc : 数据第一阶段预处理,从连续光谱数据中选出合适的光谱作为input的特征
|
||||
'''
|
||||
|
||||
|
||||
|
||||
|
||||
import datetime
|
||||
import numpy as np
|
||||
class ChooseFrameSpatial:
|
||||
def __init__(self, interval=[-30,30],overexposion_threshold = 1,weight_mean_var = 0.5,space_num= 10, time_num = 9,framerate = 4,timewindow_time = 1,Type=6):
|
||||
"""注意,算法所需要的超参数均从此处传入,作为类成员,其他函数不允许传入超参数
|
||||
|
||||
Args:
|
||||
interval (list, optional): 此方法依赖的光谱时间范围,默认[-30,30],即获取前TSC开始时刻前30秒到TSC结束时刻的光谱数据作为此算法输入.
|
||||
"""
|
||||
# print(str(self.__class__).split("."))
|
||||
self.description=f"{str(self.__class__).split(".")[-2]}_{interval[0]}_{interval[1]}"
|
||||
|
||||
# print(self.description)
|
||||
|
||||
self.overexposion_threshold = overexposion_threshold #过曝阈值:有多少时间点过曝时,判定为这个空间点过曝
|
||||
self.weight_mean_var =weight_mean_var #强度/方差选取权重,默认平均光强权重为1,此权重为方差权重,取值为[0,∞)
|
||||
self.space_num= space_num #选取空间点数量
|
||||
self.time_num = time_num #选取时间点数量
|
||||
self.framerate = framerate #采样帧率
|
||||
self.timewindow = framerate * timewindow_time #选取时间窗口为1s
|
||||
|
||||
|
||||
self.interval=interval
|
||||
self.Type=Type
|
||||
|
||||
def get_data_interval(self, start_time, end_time ):
|
||||
|
||||
return start_time+datetime.timedelta(seconds=self.interval[0]),end_time+datetime.timedelta(seconds=self.interval[1])
|
||||
|
||||
def Overexposed_spatial_point_detection(self,inputdata, timethreshold):
|
||||
#判别一个空间点是否过曝,inputdata是(time,wavelength)二维数组,timethreshold是时间阈值,有多少个时间点过曝则认为这个点过曝
|
||||
# 如果过曝返回False,不过曝返回True
|
||||
|
||||
row_max_values= np.max(inputdata, axis=1)
|
||||
overexposed_rows = np.sum(row_max_values >= 4095)
|
||||
if overexposed_rows > timethreshold:
|
||||
return False
|
||||
else:
|
||||
return True
|
||||
def run(self,timestamps,rawdata):
|
||||
# 降维方法:空间上选取强度达到一定的阈值且方差最小的点,时间上把整个时间段划分并选取各段权重最高的1s进行平均,Type=1取前1/3,Type=2取中部1/3,Type=3取最后1/3,Type=4取全部
|
||||
|
||||
|
||||
############# 空间过曝点去除 ######################
|
||||
unoverposed_indices = [i for i in range(rawdata.shape[2]) if self.Overexposed_spatial_point_detection(rawdata[:, :, i],self.overexposion_threshold)]
|
||||
rawdata = rawdata[:,:,unoverposed_indices]
|
||||
|
||||
############# 空间数据压缩 ######################
|
||||
space_intensity_mean = np.mean(np.sum(rawdata, axis=1), axis=0)
|
||||
space_intensity_var = np.var(np.sum(rawdata, axis=1), axis=0)
|
||||
combined_score = (space_intensity_mean - np.min(space_intensity_mean)) / (np.max(space_intensity_mean) - np.min(space_intensity_mean)) \
|
||||
- self.weight_mean_var * (space_intensity_var - np.min(space_intensity_var)) / (np.max(space_intensity_var) - np.min(space_intensity_var))
|
||||
|
||||
sorted_indices = np.argsort(combined_score)[::-1]
|
||||
space_selected_data = rawdata[:,:,sorted_indices[:self.space_num]]
|
||||
|
||||
############# 时间数据压缩 ######################
|
||||
space_selected_data_intensity = np.sum(space_selected_data,axis=1)
|
||||
|
||||
#按照时间把强度数据拆分成num_time份,每一份维度为(总时间点数/time_num, space_num)
|
||||
time_length = space_selected_data_intensity.shape[0]
|
||||
chunk_size = time_length // self.time_num
|
||||
remainder = time_length % self.time_num
|
||||
start_index = 0
|
||||
|
||||
time_selected_data =None
|
||||
|
||||
for i in range(self.time_num):
|
||||
end_index = start_index + chunk_size + (1 if i < remainder else 0)
|
||||
chunk = space_selected_data_intensity[start_index:end_index, :]
|
||||
|
||||
window_var = np.empty(0)
|
||||
window_mean = np.empty(0)
|
||||
|
||||
for j in range(len(chunk)-self.timewindow-1):
|
||||
window_var = np.concatenate((window_var, np.expand_dims( np.sum(np.var(chunk[j:j+self.timewindow,:], axis=0)),0)), axis=0)
|
||||
window_mean = np.concatenate((window_mean, np.expand_dims( np.sum(np.mean(chunk[j:j+self.timewindow,:], axis=0)),0)), axis=0)
|
||||
|
||||
combined_score_time = (window_mean - np.min(window_mean)) / (np.max(window_mean) - np.min(window_mean)) \
|
||||
- self.weight_mean_var * (window_var - np.min(window_var)) / (np.max(window_var) - np.min(window_var))
|
||||
sorted_indices = np.argsort(combined_score_time)[::-1]
|
||||
|
||||
time_window_data = np.mean(space_selected_data[start_index+sorted_indices[0]:start_index+sorted_indices[0]+self.timewindow, :, :],axis=0)
|
||||
# print(time_selected_data.shape,time_window_data.shape,np.expand_dims( time_window_data,0).shape)
|
||||
if time_selected_data is None:
|
||||
|
||||
time_selected_data=np.expand_dims( time_window_data,0)
|
||||
else:
|
||||
time_selected_data = np.concatenate((time_selected_data, np.expand_dims( time_window_data,0)), axis=0)
|
||||
|
||||
start_index = end_index
|
||||
|
||||
if self.Type == 1:
|
||||
print("time_selected_data[前1/3].shape: ", time_selected_data[:self.time_num//3,:,:].shape)
|
||||
return time_selected_data[:self.time_num//3,:,:]
|
||||
elif self.Type == 2:
|
||||
print("time_selected_data[中1/3].shape: ", time_selected_data[self.time_num//3:2*self.time_num//3,:,:].shape)
|
||||
return time_selected_data[self.time_num//3:2*self.time_num//3,:,:]
|
||||
elif self.Type == 3:
|
||||
print("time_selected_data[后1/3].shape: ", time_selected_data[2*self.time_num//3:,:,:].shape)
|
||||
return time_selected_data[2*self.time_num//3:,:,:]
|
||||
elif self.Type == 4:
|
||||
print("time_selected_data[前2/3].shape: ", time_selected_data[:2*self.time_num//3,:,:].shape)
|
||||
return time_selected_data[:2*self.time_num//3,:,:]
|
||||
elif self.Type == 5:
|
||||
print("time_selected_data[后2/3].shape: ", time_selected_data[self.time_num//3:,:,:].shape)
|
||||
return time_selected_data[self.time_num//3:,:,:]
|
||||
elif self.Type == 6:
|
||||
print("time_selected_data.shape: ", time_selected_data.shape)
|
||||
return time_selected_data
|
||||
else:
|
||||
print("Type is not 1, 2, 3, 4, 5 or 6 !")
|
||||
return None
|
||||
|
||||
if __name__ =="__main__":
|
||||
timpestamp=np.zeros((600,))
|
||||
input_data=np.random.random((600,224,512))*4095
|
||||
tmp=ChooseFrameSpatial()
|
||||
output=tmp.run(timpestamp,input_data)
|
||||
print(f"输入数据维度:{input_data.shape}\n输出数据维度:{output.shape}")
|
src/data/choose_frame_spatial/meanwovar_CARS.py (new file): 78 lines added
@@ -0,0 +1,78 @@
'''
@File    : meanwovar_CARS.py
@Time    : 2024/08/12 13:53:36
@Author  : Zhanpeng Yang
@Version : 0.0.1
@Contact : zhpyang@outlook.com
@Desc    : Stage-one data preprocessing: select suitable spectra from the continuous spectral stream as the input features
'''

import datetime
import numpy as np
from scipy.signal import savgol_filter

class ChooseFrameSpatial:
    def __init__(self,):
        """Note: every hyperparameter the algorithm needs is passed in here and stored as a class member; the other methods must not take hyperparameters.

        Args:
            interval (list, optional): the spectral time range this method relies on; by default [-30,30], i.e. from 30 s before the TSC start to the TSC end.
        """

        # tmp=
        # if len(tmp)>2:
        #     self.description=f"{str(self.__class__).split('.')[-2]}_{interval[0]}_{interval[1]}"
        self.description = f"{str(self.__class__)[8:-2].split('.')[-2]}"

        print(self.description)
        # print(self.description)

    def run(self, measureStartDatetime, measureEndDatetime, timestamps, rawdata):
        ############# spatial data compression ######################
        space_intensity_mean = np.mean(np.sum(rawdata, axis=1), axis=0)

        space_num = 10  # number of points with the smallest variance to keep

        light_indices = np.where( (space_intensity_mean > (0.25*np.max(space_intensity_mean)+0.75*np.min(space_intensity_mean)))
                                  & (space_intensity_mean < (0.8*np.max(space_intensity_mean)+0.2*np.min(space_intensity_mean)))
                                  )

        if light_indices[0].shape[0] < space_num:
            print('No Enough Usable Space Point Found!')
            return None

        intensity_data = rawdata[:,:,light_indices[0]]

        spatial_variance = np.var(np.sum(intensity_data, axis=1), axis=0)
        top_10_indices = np.argsort(spatial_variance)[:space_num]
        space_selected_data = intensity_data[:, :, top_10_indices]

        selected_data = np.mean(np.mean(space_selected_data, axis=0), axis=1)

        selected_data = savgol_filter(selected_data, window_length=11, polyorder=2, axis=0)

        top_features = [106, 127, 118, 0, 52, 23, 113, 77, 157, 175, 195, 24, 218, 108, 8, 211, 2, 136,
                        11, 36, 129, 109, 153, 188, 200, 196, 7, 19, 6, 198, 193, 221, 156, 187, 162, 204,
                        85, 104, 41, 16, 185, 125, 14, 149, 91, 138, 72, 146, 35, 53, 190, 148, 75, 18,
                        17, 96, 167, 192, 201, 31, 158, 183, 32, 40, 123, 145, 161, 27, 209, 216, 101, 51,
                        147, 58, 182, 49, 119, 13, 179, 140, 105, 45, 55, 33, 73, 111, 97, 194, 121, 89,
                        38, 12, 197, 173, 160, 131, 141, 37, 208, 47]
        selected_data = selected_data[top_features]

        return selected_data

if __name__ == "__main__":
    timestamp = np.zeros((600,))
    input_data = np.random.random((56, 224, 512))*4095
    tmp = ChooseFrameSpatial()
    output = tmp.run(0, 0, timestamp, input_data)
    print(f"input shape: {input_data.shape}\noutput shape: {output.shape}")
BIN src/data/features_scaling/__pycache__/do_nothing.cpython-312.pyc (new file; binary not shown)
Binary files not shown.
src/data/features_scaling/do_nothing.py (new file): 21 lines added
@@ -0,0 +1,21 @@
'''
@File    : do_nothing.py
@Time    : 2024/08/12 14:02:59
@Author  : Zhanpeng Yang
@Version : 0.0.1
@Contact : zhpyang@outlook.com
@Desc    : Scaling of the input features (this variant passes them through unchanged)
'''

import numpy as np

class FeatureScaling:
    def __init__(self):
        self.description = f"{str(self.__class__).split('.')[-2]}"
        self.flag_get_global_info = False

    def run(self, feature):
        return feature
src/data/features_scaling/frame_max_min.py (new file): 23 lines added
@@ -0,0 +1,23 @@
'''
@File    : frame_max_min.py
@Time    : 2024/08/12 14:02:59
@Author  : Zhanpeng Yang
@Version : 0.0.1
@Contact : zhpyang@outlook.com
@Desc    : Scaling of the input features (per-sample min-max normalization)
'''

import numpy as np

class FeatureScaling:
    def __init__(self):
        self.description = f"{str(self.__class__).split('.')[-2]}"
        self.flag_get_global_info = False

    def run(self, feature):
        feature = (feature - feature.min()) / (feature.max() - feature.min())
        return feature
src/data/features_scaling/standardization.py (new file): 72 lines added
@@ -0,0 +1,72 @@
'''
@File    : standardization.py
@Time    : 2024/08/12 14:03:38
@Author  : Zhanpeng Yang
@Version : 0.0.1
@Contact : zhpyang@outlook.com
@Desc    : Standardize (z-score) the input features
'''

import numpy as np
from pathlib import Path

class FeatureScaling:
    def __init__(self):
        self.description = f"{str(self.__class__).split('.')[-2]}"
        self.flag_get_global_info = True
        self._min = None
        self._max = None
        self._mean = None
        self._var = None
        self._num = 0

        self.file_path = Path(__file__).absolute()

    def load_state_dict(self, state):
        self._min = state["_min"]
        self._max = state["_max"]
        self._mean = state["_mean"]
        self._var = state["_var"]
        self._num = state["_num"]

    def state_dict(self):
        return {"_min": self._min, "_max": self._max, "_mean": self._mean, "_var": self._var, "_num": self._num}

    def get_global_info(self, feature):
        feature = feature.astype(np.float32)
        if self._max is None:
            self._min = np.zeros(feature.shape, dtype=np.float32) + 999999999
            self._max = np.zeros(feature.shape, dtype=np.float32)
            self._mean = np.zeros(feature.shape, dtype=np.float32)
            self._var = np.zeros(feature.shape, dtype=np.float32)
            self._num = 0
            self.feature_shape = feature.shape

        self._min = np.minimum(self._min, feature)
        self._max = np.maximum(self._max, feature)

        if self._num > 0:
            self._var = ((self._num-1)/(self._num))*self._var + (1/(self._num+1))*(feature-self._mean)*(feature-self._mean)
        # the variance must be updated before the mean (the update above uses the previous mean)
        self._mean = (self._num/(self._num+1))*self._mean + (1/(self._num+1))*feature
        self._num = self._num + 1

        # print("num:", self._num, "_mean:", self._mean, "_var:", self._var, "_min:", self._min, "_max:", self._max)

    def run(self, feature):
        # for i in range(label.shape[0]):
        #     label[i]=(label[i]-self._min[i])/(self._max[i]-self._min[i])
        feature = (feature - self._mean) / np.sqrt(self._var)
        return feature

    # def reverse(self, feature):
    #     # for i in range(labels.shape[1]):
    #     #     labels[:,i]=labels[:,i]*(self._max[i]-self._min[i])+self._min[i]
    #     labels = labels*np.sqrt(self._var)+self._mean
    #     return labels
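A minimal usage sketch of the two-pass protocol this class implies (not part of the commit; the calling convention is an assumption based on flag_get_global_info and state_dict): first accumulate running statistics over the training samples with get_global_info, then standardize each sample with run.

# Usage sketch (assumed calling convention, consistent with flag_get_global_info=True).
import numpy as np
from data.features_scaling.standardization import FeatureScaling

scaler = FeatureScaling()
train_features = [np.random.rand(224).astype(np.float32) for _ in range(100)]  # 224 spectral bands, synthetic

for f in train_features:                 # pass 1: accumulate running min/max/mean/var
    scaler.get_global_info(f)

scaled = scaler.run(train_features[0])   # pass 2: z-score each sample with the global statistics
state = scaler.state_dict()              # persisted alongside the model so inference can rescale identically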
src/data/generate_raw_dataset_from_MySQL.py (new file): 202 lines added
@ -0,0 +1,202 @@
|
||||
import os
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
import pymysql.cursors
|
||||
|
||||
import numpy as np
|
||||
import gzip
|
||||
import msgpack
|
||||
|
||||
# oldRawDatasetDir=
|
||||
|
||||
def getLatestData(dataNum,deviceId='DOJHBG',measureType="TSC"):
|
||||
connection = pymysql.connect(host='localhost',
|
||||
user='root',
|
||||
password='Ghs@2211',
|
||||
database='SEMS',
|
||||
cursorclass=pymysql.cursors.DictCursor)
|
||||
with connection:
|
||||
with connection.cursor() as cursor:
|
||||
# 定义 SQL 查询
|
||||
sql = """
|
||||
SELECT
|
||||
f.furnaceNumber,
|
||||
f.measureStartDatetime,
|
||||
f.measureEndDatetime,
|
||||
f.Temperature,
|
||||
f.C,
|
||||
f.P,
|
||||
f.S,
|
||||
f.Mn,
|
||||
f.Ni,
|
||||
f.Mo,
|
||||
f.Cr,
|
||||
ot.spectralData,
|
||||
ot.spectralDim,
|
||||
ot.spatialDim
|
||||
FROM
|
||||
furnace f
|
||||
JOIN
|
||||
online_test ot ON f.furnaceId = ot.furnaceId
|
||||
WHERE
|
||||
f.deviceId = %s AND
|
||||
f.measureType = %s
|
||||
ORDER BY
|
||||
f.furnaceID DESC
|
||||
LIMIT %s;
|
||||
"""
|
||||
values=(deviceId,measureType,dataNum)
|
||||
cursor.execute(sql,values)
|
||||
result = cursor.fetchall()
|
||||
return result
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def getMetadataFromMySQL(deviceId='DOJHBG',measureType="TSC"):
|
||||
connection = pymysql.connect(host='localhost',
|
||||
user='root',
|
||||
password='Ghs@2211',
|
||||
database='SEMS',
|
||||
cursorclass=pymysql.cursors.DictCursor)
|
||||
dataNum=0
|
||||
with connection:
|
||||
with connection.cursor() as cursor:
|
||||
# 定义 SQL 查询
|
||||
sql = """
|
||||
SELECT
|
||||
COUNT(*)
|
||||
FROM
|
||||
furnace as f
|
||||
WHERE
|
||||
f.deviceId = %s AND
|
||||
f.measureType = %s
|
||||
"""
|
||||
values=(deviceId,measureType)
|
||||
|
||||
cursor.execute(sql,values)
|
||||
result = cursor.fetchall()
|
||||
dataNum=result[0]["COUNT(*)"]
|
||||
|
||||
sql = """
|
||||
SELECT furnaceNumber
|
||||
FROM furnace as f
|
||||
WHERE
|
||||
f.deviceId = %s AND
|
||||
f.measureType = %s
|
||||
ORDER BY furnaceId DESC
|
||||
LIMIT 1;
|
||||
"""
|
||||
values=(deviceId,measureType)
|
||||
|
||||
cursor.execute(sql,values)
|
||||
result = cursor.fetchall()
|
||||
lastesFurnaceNumber=result[0]["furnaceNumber"]
|
||||
return dataNum,lastesFurnaceNumber
|
||||
|
||||
|
||||
def saveDataset(caches, datasetDir,datasetType="train"):
|
||||
os.makedirs(datasetDir/datasetType)
|
||||
|
||||
for cache in caches:
|
||||
print(type(cache))
|
||||
if isinstance(cache,str):
|
||||
oldRawDatasetDir=Path("/data/SEMS-model-training/old_rawdata/dataset")
|
||||
shutil.copy(oldRawDatasetDir/cache, datasetDir/datasetType)
|
||||
if isinstance(cache,dict):
|
||||
print(cache.keys())
|
||||
spectralData=msgpack.unpackb(gzip.decompress(cache["spectralData"]))
|
||||
# spectralData=msgpack.unpack(gzip.decompress(spectralDatawithTime["buffer"]))
|
||||
spectralDataNumpy=np.frombuffer(spectralData["buffer"],dtype=np.uint16).reshape(int(len(spectralData["buffer"])/cache["spectralDim"]/cache["spatialDim"]/2),cache["spectralDim"],cache["spatialDim"])
|
||||
|
||||
np.savez(datasetDir/datasetType/f"{cache["furnaceNumber"]}.npz",furnaceNumber=cache["furnaceNumber"],measureStartDatetime=cache["measureStartDatetime"],measureEndDatetime=cache["measureEndDatetime"],timestamps=spectralData["timestamps"],rawSpectralData=spectralDataNumpy,rawLabels=np.array([cache["Temperature"],cache["C"],cache["P"],cache["S"],cache["Mn"],cache["Ni"],cache["Mo"],cache["Cr"],]),labelNames=["Temperature","C","P","S","Mn","Ni","Mo","Cr"])
|
||||
|
||||
|
||||
|
||||
# break
|
||||
|
||||
|
||||
|
||||
def generateRawDataset(datasetDir="/data/SEMS-model-training/dataset/raw_dataset",datasetNum=1000,validationRate=0.2,testRate=0.1):
|
||||
|
||||
datasetDir=Path(datasetDir)
|
||||
|
||||
|
||||
oldRawDatasetDir=Path("/data/SEMS-model-training/old_rawdata/dataset")
|
||||
oldRawDatasetFileNames=os.listdir(oldRawDatasetDir)
|
||||
oldRawDatasetFileNames.sort()
|
||||
oldRawDatasetNum=len(oldRawDatasetFileNames)
|
||||
# latestFurnaceID=oldRawDatasetFileNames[-1]
|
||||
|
||||
DBdataNum,lastesFurnaceNumber=getMetadataFromMySQL(deviceId='DOJHBG',measureType="TSC")
|
||||
|
||||
|
||||
|
||||
|
||||
if os.path.exists(datasetDir):
|
||||
#如果存在数据集,则判断是否为最新
|
||||
if os.path.exists(datasetDir/"validation_set"):
|
||||
#"通过判断文件中的最新炉次号与已存在的数据集中的一不一样
|
||||
valNames=os.listdir(datasetDir/"validation_set")
|
||||
valNames.sort()
|
||||
latestDatasetFurnaceID=valNames[-1]
|
||||
if lastesFurnaceNumber==latestDatasetFurnaceID:
|
||||
"如果数据集已经是最新的,那就直接返回,不进行任何操作"
|
||||
return None
|
||||
else:
|
||||
shutil.rmtree(datasetDir)
|
||||
else:
|
||||
shutil.rmtree(datasetDir)
|
||||
os.makedirs(datasetDir)
|
||||
|
||||
chooseDBdataNum=0
|
||||
chooseOLDRawDataNum=0
|
||||
if (DBdataNum+oldRawDatasetNum)>datasetNum:
|
||||
if DBdataNum>=datasetNum:
|
||||
chooseDBdataNum=datasetNum
|
||||
chooseOLDRawDataNum=0
|
||||
else:
|
||||
chooseDBdataNum=DBdataNum
|
||||
chooseOLDRawDataNum=datasetNum-DBdataNum
|
||||
else:
|
||||
chooseDBdataNum=DBdataNum
|
||||
chooseOLDRawDataNum=oldRawDatasetNum
|
||||
|
||||
print(f"旧数据数量:{chooseOLDRawDataNum}, 新数据数量{chooseDBdataNum}")
|
||||
|
||||
if chooseDBdataNum>0:
|
||||
#
|
||||
DBDataset=getLatestData(chooseDBdataNum,deviceId='DOJHBG',measureType="TSC")
|
||||
else:
|
||||
DBDataset=[]
|
||||
|
||||
rawDatasetCache=oldRawDatasetFileNames[-chooseOLDRawDataNum:]+DBDataset
|
||||
print(rawDatasetCache[-1].keys())
|
||||
datasetNum=len(rawDatasetCache)
|
||||
valNum=int(datasetNum*validationRate)
|
||||
testNum=int(datasetNum*testRate)
|
||||
trainNum=datasetNum-valNum-testNum
|
||||
|
||||
saveDataset(rawDatasetCache[:trainNum], datasetDir,datasetType="training")
|
||||
saveDataset(rawDatasetCache[-testNum-valNum:-testNum], datasetDir,datasetType="validation")
|
||||
|
||||
saveDataset(rawDatasetCache[-testNum:], datasetDir,datasetType="test")
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
if __name__ =="__main__":
|
||||
tmp=generateRawDataset()
|
||||
# getMetadataFromMySQL()
|
||||
|
||||
# getLatestData(dataNum=1)
|
||||
|
||||
|
||||
|
||||
|
Binary files not shown.
@@ -33,6 +33,7 @@ class LabelScaling:
            self._min[i]=label[i]

    def run(self,label):
        for i in range(label.shape[0]):
            label[i]=(label[i]-self._min[i])/(self._max[i]-self._min[i])
        return label
@@ -42,4 +43,3 @@ class LabelScaling:
        return labels
src/data/labels_scaling/standardization.py (new file): 75 lines added
@@ -0,0 +1,75 @@
'''
@File    : standardization.py
@Time    : 2024/08/12 14:03:38
@Author  : Zhanpeng Yang
@Version : 0.0.1
@Contact : zhpyang@outlook.com
@Desc    : Standardize (z-score) the output labels
'''

import numpy as np
from pathlib import Path

class LabelScaling:
    def __init__(self):
        self.description = f"{str(self.__class__).split('.')[-2]}"
        self.flag_get_global_info = True
        self._min = None
        self._max = None
        self._mean = None
        self._var = None
        self._num = 0

        self.file_path = Path(__file__).absolute()

    def load_state_dict(self, state):
        self._min = state["_min"]
        self._max = state["_max"]
        self._mean = state["_mean"]
        self._var = state["_var"]
        self._num = state["_num"]

    def state_dict(self):
        return {"_min": self._min, "_max": self._max, "_mean": self._mean, "_var": self._var, "_num": self._num}

    def get_global_info(self, label):
        label = label.astype(np.float32)
        label_dim = label.shape[0]
        if self._max is None:
            self._min = np.zeros((label_dim), dtype=np.float32) + 999999999
            self._max = np.zeros((label_dim), dtype=np.float32)
            self._mean = np.zeros((label_dim), dtype=np.float32)
            self._var = np.zeros((label_dim), dtype=np.float32)
            self._num = 0

        self._min = np.minimum(self._min, label)
        self._max = np.maximum(self._max, label)

        if self._num > 0:
            self._var = ((self._num-1)/(self._num))*self._var + (1/(self._num+1))*(label-self._mean)*(label-self._mean)
        # the variance must be updated before the mean (the update above uses the previous mean)
        self._mean = (self._num/(self._num+1))*self._mean + (1/(self._num+1))*label
        self._num = self._num + 1

        # print("num:", self._num, "_mean:", self._mean, "_var:", self._var, "_min:", self._min, "_max:", self._max)
        # print("_var:", self._var)

    def run(self, label):
        # for i in range(label.shape[0]):
        #     label[i]=(label[i]-self._min[i])/(self._max[i]-self._min[i])
        label = (label - self._mean) / np.sqrt(self._var)
        return label

    def reverse(self, labels):
        # for i in range(labels.shape[1]):
        #     labels[:,i]=labels[:,i]*(self._max[i]-self._min[i])+self._min[i]
        labels = labels * np.sqrt(self._var) + self._mean
        return labels
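Unlike the feature scaler above, reverse() is active here, so scaled model outputs can be mapped back to physical label values. A small round-trip sketch (synthetic label values, assumed workflow, not part of the commit):

# Round-trip sketch: scale labels for training, then undo the scaling on predictions.
import numpy as np
from data.labels_scaling.standardization import LabelScaling

rng = np.random.default_rng(0)
noise_scale = np.array([10.0, 0.01, 0.005, 0.002], dtype=np.float32)          # hypothetical spreads
labels = [np.array([1600.0, 0.05, 0.02, 0.01], dtype=np.float32)              # hypothetical [T, C, ...] values
          + (rng.normal(0, 1, 4) * noise_scale).astype(np.float32)
          for _ in range(50)]

scaler = LabelScaling()
for y in labels:
    scaler.get_global_info(y)            # accumulate per-label mean/var over the training set

y_scaled = scaler.run(labels[0].copy())  # what the network is trained against
y_back = scaler.reverse(y_scaled)        # ~= labels[0], back in physical units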
@ -14,6 +14,8 @@ import numpy as np
|
||||
import datetime
|
||||
import pickle
|
||||
import h5py
|
||||
from pathlib import Path
|
||||
from tqdm import tqdm
|
||||
|
||||
class DataPreProcess:
|
||||
"""
|
||||
@ -23,274 +25,85 @@ class DataPreProcess:
|
||||
3. 将feature 与 Label进行放缩
|
||||
|
||||
"""
|
||||
def __init__(self, raw_spectral_data_dir, raw_labels_dir,labels_name,dataset_dir,choose_frame_spatial, features_scaling,labels_scaling,train_ratio=0.8,validate_ratio=0.1) -> None:
|
||||
def __init__(self, dataset_dir,choose_frame_spatial, features_scaling,labels_scaling) -> None:
|
||||
"""初始化类,传入所有数据预处理需要的参数
|
||||
|
||||
Args:
|
||||
raw_spectral_data_dir (_type_):原始光谱数据路径
|
||||
raw_labels_dir (_type_): 原始标签路径
|
||||
labels_name (_type_): 提取出的标签名
|
||||
dataset_dir (_type_): 生成的数据集文件夹
|
||||
choose_frame_spatial (_type_): 类对象,进行光谱特征挑选
|
||||
features_scaling (_type_): 类对象,对输入的特征进行放缩,使之直接输入神经网络
|
||||
labels_scaling (_type_): 类对象,对输出的的标签进行放缩
|
||||
train_ratio (float, optional): 训练集比例. Defaults to 0.8.
|
||||
validate_ratio (float, optional): 验证集比例,剩余的即为测试集. Defaults to 0.1.
|
||||
"""
|
||||
self.raw_spectral_data_dir=raw_spectral_data_dir
|
||||
self.raw_labels_dir=raw_labels_dir
|
||||
self.dataset_dir=dataset_dir
|
||||
self.labels_name=labels_name
|
||||
|
||||
self.dataset_dir=Path(dataset_dir)
|
||||
|
||||
self.choose_frame_spatial=choose_frame_spatial
|
||||
self.features_scaling=features_scaling
|
||||
self.labels_scaling=labels_scaling
|
||||
self.train_ratio=train_ratio
|
||||
self.validate_ratio=validate_ratio
|
||||
self.labelNames=None
|
||||
|
||||
#加载原始的标签
|
||||
self.raw_labels=self._load_raw_labels(raw_labels_dir,labels_name)
|
||||
# _raw_data_2_dataset(self)
|
||||
self.run()
|
||||
|
||||
#加载原始光谱的缓存
|
||||
self.raw_spectral_data_cache=self._load_raw_spectral_data_cache(raw_spectral_data_dir)
|
||||
def _raw_data_2_pre_dataset(self,pre_dataset_save_dir,dataset_type="test"):
|
||||
|
||||
#随便加载一个csv文件,判断光谱数据的维度
|
||||
self.spectral_dim,self.spatial_dim= self._get_spectral_spatial_dim(self.raw_spectral_data_cache)
|
||||
rawdata_names= os.listdir(self.dataset_dir/"raw_dataset"/dataset_type)
|
||||
if not os.path.exists(pre_dataset_save_dir/dataset_type):
|
||||
os.makedirs(pre_dataset_save_dir/dataset_type)
|
||||
print(f"[预处理]: {dataset_type}数据集选择特征时间与空间点 ")
|
||||
for name in tqdm(rawdata_names):
|
||||
|
||||
#正式开始原始数据转化为数据集
|
||||
self._raw_data_2_dataset()
|
||||
tmp_data=np.load(self.dataset_dir/"raw_dataset"/dataset_type/name,allow_pickle=True)
|
||||
|
||||
raw_spectral_data=self.choose_frame_spatial.run(tmp_data["measureStartDatetime"],tmp_data["measureEndDatetime"],tmp_data["timestamps"],tmp_data["rawSpectralData"])
|
||||
|
||||
flag_data_augmentation=False
|
||||
################################
|
||||
#此段代码是为数据增强,raw_spectral_data是二维数据,代表多个光谱曲线,使每个光谱曲线都对应相同的label。
|
||||
|
||||
def _load_raw_labels(self,raw_labels_dir:str,labels_name:list)->pd.DataFrame:
|
||||
"""读取利用脚本处理后的钢厂给的excel文件所在文件夹(会扫描所有文件)
|
||||
并选择指定的label名作为output
|
||||
并去除值为NaN或0的行
|
||||
|
||||
Args:
|
||||
raw_labels_dir (str): 利用脚本处理后的钢厂给的excel路径
|
||||
labels_name (list): 指定的作为Label的列
|
||||
|
||||
Returns:
|
||||
pd.DataFrame: 返回所有筛选后的炉次数据
|
||||
"""
|
||||
|
||||
raw_labels=None
|
||||
for name in os.listdir(raw_labels_dir):
|
||||
tmp_raw_labels=pd.read_excel(os.path.join(raw_labels_dir,name))
|
||||
|
||||
choosed_column=["TSC_start_time","TSC_end_time"]
|
||||
choosed_column=choosed_column+labels_name
|
||||
|
||||
|
||||
#只选出我们想要的部分作为标签
|
||||
tmp_raw_labels=tmp_raw_labels.loc[:,choosed_column]
|
||||
|
||||
# 选出有NULL的行
|
||||
null_rows=tmp_raw_labels.isnull().any(axis=1)
|
||||
# # 选出有0的行
|
||||
zeros_rows=(tmp_raw_labels==0).any(axis=1) | (tmp_raw_labels=='0').any(axis=1)
|
||||
|
||||
# # 每一行但凡有NULL或者0都给删了
|
||||
selected_rows=~(null_rows|zeros_rows)
|
||||
tmp_raw_labels=tmp_raw_labels[selected_rows]
|
||||
|
||||
if raw_labels is None:
|
||||
raw_labels=tmp_raw_labels
|
||||
else:
|
||||
raw_labels=pd.concat([raw_labels,tmp_raw_labels],axis=0)
|
||||
logging.debug(f"Reading raw label excel file:{name}, which has {tmp_raw_labels.shape[0]} furnaces")
|
||||
logging.debug(f"Readed raw label excel files, which has {raw_labels.shape[0]} furnaces in total")
|
||||
|
||||
return raw_labels
|
||||
def _load_raw_spectral_data_cache(self,raw_spectral_data_dir:str)->list:
|
||||
"""生成所有原始光谱数据文件的缓存,包括每个文件记录的开始及结束时间,目的为加快后面读取原始数据的速度
|
||||
|
||||
Args:
|
||||
raw_spectral_data_dir (str): 原始光谱所在路径
|
||||
|
||||
Returns:
|
||||
list: 缓存,其中有多少个成员,就有多少个原始数据文件,每个成员包括格式为datetime的开始及结束时间,及文件路径
|
||||
"""
|
||||
spectral_file_paths=glob.glob(os.path.join(raw_spectral_data_dir,"*.csv"))
|
||||
cache_file_paths=glob.glob(os.path.join(raw_spectral_data_dir,"*.pkl"))
|
||||
|
||||
update_flag=False
|
||||
|
||||
|
||||
#不存在缓存文件,以及缓存文件中数据文件个数与文件夹中的文件个数不一致,均重新生成
|
||||
if len(cache_file_paths)==0:
|
||||
logging.debug(f"Raw spectral data cache is not existed! Generating")
|
||||
update_flag=True
|
||||
elif len(cache_file_paths)==1:
|
||||
with open(cache_file_paths[0],"rb") as f:
|
||||
raw_spectral_data_cache=pickle.load(f)
|
||||
if len(raw_spectral_data_cache) !=len(spectral_file_paths):
|
||||
logging.debug(f"Raw spectral data cache is out of date! Regenerating, cache file number:{len(raw_spectral_data_cache)}, spectral data file number: {len(spectral_file_paths)}")
|
||||
update_flag=True
|
||||
else:
|
||||
logging.error(f"More the one 'raw_spectral_data_cache.pkl' file is existed in {raw_spectral_data_dir}")
|
||||
|
||||
if update_flag:
|
||||
spectral_file_paths.sort()
|
||||
raw_spectral_data_cache=[]
|
||||
for file in spectral_file_paths:
|
||||
tmp_info={}
|
||||
tmp_data=np.loadtxt(file, delimiter=",")
|
||||
start_t=datetime.datetime.fromtimestamp(tmp_data[0,0]/1000)+datetime.timedelta(microseconds=tmp_data[0,0]%1000)
|
||||
end_t=datetime.datetime.fromtimestamp(tmp_data[-1,0]/1000)+datetime.timedelta(microseconds=tmp_data[-1,0]%1000)
|
||||
tmp_info["start_t"]=start_t
|
||||
tmp_info["end_t"]=end_t
|
||||
tmp_info["file_path"]=file
|
||||
raw_spectral_data_cache.append(tmp_info)
|
||||
|
||||
with open(os.path.join(raw_spectral_data_dir,f"raw_spectral_data_cache.pkl"),"wb") as f:
|
||||
pickle.dump(raw_spectral_data_cache,f)
|
||||
|
||||
return raw_spectral_data_cache
|
||||
def _get_spectral_spatial_dim(self,raw_spectral_data_cache):
|
||||
data=np.loadtxt(raw_spectral_data_cache[0]["file_path"], delimiter=",").astype(np.uint64)
|
||||
if data[0,2]==229376:
|
||||
spectral_dim =224
|
||||
spatial_dim=512
|
||||
if data[0,2]==917504:
|
||||
spectral_dim =448
|
||||
spatial_dim=1024
|
||||
return spectral_dim, spatial_dim
|
||||
|
||||
def _read_spectral_data(self,start_time:datetime.datetime,end_time:datetime.datetime)->np.ndarray:
|
||||
"""获取从start_time到end_time的光谱数据
|
||||
|
||||
Args:
|
||||
start_time (datetime.datetime): 开始时间
|
||||
end_time (datetime.datetime): 结束时间
|
||||
|
||||
Returns:
|
||||
np.ndarray: 原始光谱数据
|
||||
"""
|
||||
|
||||
|
||||
def get_spectral_data_per_file(file_path,s_t,e_t):
|
||||
data=np.loadtxt(file_path, delimiter=",").astype(np.uint64)
|
||||
|
||||
if s_t is not None:
|
||||
tmp_s=datetime.datetime.timestamp(s_t)*1000
|
||||
tmp_s_index=0
|
||||
for i in range(data.shape[0]-1):
|
||||
if data[i,0]<=tmp_s and data[i+1,0]>=tmp_s:
|
||||
tmp_s_index=i
|
||||
break
|
||||
else:
|
||||
tmp_s_index=0
|
||||
if e_t is not None:
|
||||
tmp_e=datetime.datetime.timestamp(e_t)*1000
|
||||
tmp_e_index=data.shape[0]
|
||||
for i in range(tmp_s_index,data.shape[0]-1):
|
||||
if data[i,0]<=tmp_e and data[i+1,0]>=tmp_e:
|
||||
tmp_e_index=i
|
||||
break
|
||||
else:
|
||||
tmp_e_index=data.shape[0]
|
||||
|
||||
with open(file_path[:-3]+"bin", "rb") as f:
|
||||
f.seek(data[tmp_s_index,1])
|
||||
d=f.read(np.uint64((tmp_e_index-tmp_s_index)*data[tmp_s_index,2]))
|
||||
d=np.frombuffer(d, dtype=np.uint16).reshape(tmp_e_index-tmp_s_index,self.spectral_dim,self.spatial_dim)
|
||||
return data[tmp_s_index:tmp_e_index,0],d
|
||||
|
||||
timestamps=None
|
||||
raw_spectral_data=None
|
||||
for tmp_info in self.raw_spectral_data_cache:
|
||||
tmp_data=None
|
||||
if start_time<tmp_info["start_t"] and end_time>tmp_info["end_t"] and end_time<tmp_info["end_t"]:
|
||||
# 目标时间段在交于此文件时间段的左侧。所以取从文件开始到,end time的数据
|
||||
tmp_time_stamp,tmp_data=get_spectral_data_per_file(tmp_info["file_path"] ,None,end_time)
|
||||
|
||||
elif start_time<tmp_info["start_t"] and end_time>tmp_info["end_t"]:
|
||||
# 目标时间段完全包含此文件时间段。所以取从文件开始到结束的数据
|
||||
tmp_time_stamp,tmp_data=get_spectral_data_per_file(tmp_info["file_path"] ,None,None)
|
||||
elif start_time>tmp_info["start_t"] and end_time<tmp_info["end_t"]:
|
||||
# 目标时间段完全被包含此文件时间段。所以取从start_time到end_time的数据
|
||||
tmp_time_stamp,tmp_data=get_spectral_data_per_file(tmp_info["file_path"] ,start_time,end_time)
|
||||
elif start_time>tmp_info["start_t"] and start_time<tmp_info["end_t"] and end_time>tmp_info["end_t"]:
|
||||
# 目标时间段在交于此文件时间段的右侧。所以取从文件start_time到文件结束的数据
|
||||
tmp_time_stamp,tmp_data=get_spectral_data_per_file(tmp_info["file_path"] ,start_time,None)
|
||||
if tmp_data is not None:
|
||||
if raw_spectral_data is None:
|
||||
timestamps=tmp_time_stamp
|
||||
raw_spectral_data=tmp_data
|
||||
else:
|
||||
timestamps=np.concatenate((timestamps,tmp_time_stamp),axis=0)
|
||||
raw_spectral_data=np.concatenate((raw_spectral_data,tmp_data),axis=0)
|
||||
return timestamps,raw_spectral_data
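The four overlap branches in the loop above can be collapsed into a single clamp-style formulation. The sketch below is my own equivalent rewrite (assuming the same get_spectral_data_per_file helper, where None means "from the file start" or "to the file end"); it is shown only to make the intent of the branches explicit:

import numpy as np

def read_interval(raw_spectral_data_cache, start_time, end_time, get_spectral_data_per_file):
    timestamps, raw = None, None
    for info in raw_spectral_data_cache:
        # Skip files that do not overlap the requested window at all
        if info["end_t"] <= start_time or info["start_t"] >= end_time:
            continue
        # Clamp the query to this file's coverage
        s = start_time if start_time > info["start_t"] else None
        e = end_time if end_time < info["end_t"] else None
        ts, data = get_spectral_data_per_file(info["file_path"], s, e)
        timestamps = ts if timestamps is None else np.concatenate((timestamps, ts), axis=0)
        raw = data if raw is None else np.concatenate((raw, data), axis=0)
    return timestamps, raw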
|
||||
|
||||
def _raw_data_2_dataset(self):
|
||||
|
||||
save_dir=os.path.join(self.dataset_dir,self.choose_frame_spatial.description)
|
||||
|
||||
# Step 1: select the feature frames and spatial points, then save them
|
||||
|
||||
pre_dataset_save_dir=os.path.join(save_dir,"data")
|
||||
|
||||
if not os.path.exists(pre_dataset_save_dir):
|
||||
os.makedirs(pre_dataset_save_dir)
|
||||
|
||||
# If the pre-dataset directory does not exist it is regenerated; if it already exists this step is skipped.
# !!!!!!!!!! Note: if new raw data is added, delete the folders under dataset (i.e. clear this cache) so that everything is regenerated.
|
||||
for i in range(self.raw_labels.shape[0]):
|
||||
|
||||
start_time,end_time=self.choose_frame_spatial.get_data_interval(self.raw_labels.iloc[i]["TSC_start_time"],self.raw_labels.iloc[i]["TSC_end_time"])
|
||||
timestamps,raw_spectral_data=self._read_spectral_data(start_time,end_time)
|
||||
if flag_data_augmentation:
|
||||
|
||||
if raw_spectral_data is not None:
|
||||
# Only start recording if the acquired data has a frame rate above 2 frames per second
|
||||
if raw_spectral_data.shape[0]>2*(end_time-start_time).total_seconds():
|
||||
logging.debug(f"PreProcess Stage 1: [{i+1}/{self.raw_labels.shape[0]}] with {timestamps.shape[0]} frames")
|
||||
raw_spectral_data=self.choose_frame_spatial.run(timestamps,raw_spectral_data)
|
||||
np.savez(os.path.join(pre_dataset_save_dir,f"{timestamps[0]}_{timestamps.shape[0]}.npz",),timestamps=timestamps,raw_spectral_data=raw_spectral_data,raw_labels=self.raw_labels.iloc[i][self.labels_name].to_numpy())
|
||||
|
||||
for i in range(raw_spectral_data.shape[0]):
|
||||
np.savez(pre_dataset_save_dir/dataset_type/f"{name[:-4]}_{i}.npz",furnaceNumber=tmp_data["furnaceNumber"],measureStartDatetime=tmp_data["measureStartDatetime"],measureEndDatetime=tmp_data["measureEndDatetime"],timestamps=tmp_data["timestamps"],rawSpectralData=raw_spectral_data[i,...],rawLabels=tmp_data["rawLabels"],labelNames=tmp_data["labelNames"])
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
###############################
|
||||
else:
|
||||
logging.info(f"Pre Dataset is existed in {pre_dataset_save_dir}")
|
||||
if raw_spectral_data is not None:
|
||||
np.savez(pre_dataset_save_dir/dataset_type/name,furnaceNumber=tmp_data["furnaceNumber"],measureStartDatetime=tmp_data["measureStartDatetime"],measureEndDatetime=tmp_data["measureEndDatetime"],timestamps=tmp_data["timestamps"],rawSpectralData=raw_spectral_data,rawLabels=tmp_data["rawLabels"],labelNames=tmp_data["labelNames"])
|
||||
|
||||
# Step 2: apply scaling/standardization and build the dataset
|
||||
#####################
|
||||
|
||||
self.dataset_save_dir=os.path.join(save_dir,f"{self.features_scaling.description}_{self.labels_scaling.description}")
|
||||
def _pre_dataset_2_dataset(self,pre_dataset_save_dir,dataset_save_dir,dataset_type="test",savaFlag=True):
|
||||
|
||||
if not os.path.exists(self.dataset_save_dir):
|
||||
os.makedirs(self.dataset_save_dir)
|
||||
pre_dataset_files=os.listdir(pre_dataset_save_dir)
|
||||
print(f"[预处理]: {dataset_type}数据集进行数据预处理")
|
||||
pre_dataset_files=os.listdir(pre_dataset_save_dir/dataset_type)
|
||||
|
||||
if self.features_scaling.flag_get_global_info:
|
||||
|
||||
|
||||
if dataset_type=="training" and (self.features_scaling.flag_get_global_info or self.labels_scaling.flag_get_global_info):
|
||||
for name in pre_dataset_files:
|
||||
tmp_data=np.load(os.path.join(pre_dataset_save_dir,name),allow_pickle=True)
|
||||
self.features_scaling.get_global_info(tmp_data["raw_spectral_data"])
|
||||
tmp_data=np.load(os.path.join(pre_dataset_save_dir,dataset_type,name),allow_pickle=True)
|
||||
if self.labelNames is None:
|
||||
self.labelNames=tmp_data["labelNames"]
|
||||
if self.features_scaling.flag_get_global_info:
|
||||
self.features_scaling.get_global_info(tmp_data["rawSpectralData"])
|
||||
if self.labels_scaling.flag_get_global_info:
|
||||
self.labels_scaling.get_global_info(tmp_data["raw_labels"])
|
||||
|
||||
# Get the feature and label dimensions
|
||||
tmp_data=np.load(os.path.join(pre_dataset_save_dir,name),allow_pickle=True)
|
||||
feature_dim=tmp_data["raw_spectral_data"].shape
|
||||
label_dim=tmp_data["raw_labels"].shape
|
||||
self.labels_scaling.get_global_info(tmp_data["rawLabels"])
|
||||
np.set_printoptions(suppress = True)
|
||||
# print(f"[预处理]Label全局信息:_num",self.labels_scaling._num,"_mean:",self.labels_scaling._mean,"_var:",self.labels_scaling._var,"_min:",self.labels_scaling._min,"_max:",self.labels_scaling._max)
|
||||
|
||||
|
||||
# Split into training / validation / test sets
|
||||
np.random.shuffle(pre_dataset_files)
|
||||
[train_dateset_files,validate_dateset_files]=np.split(pre_dataset_files,[int(len(pre_dataset_files)*self.train_ratio)])
|
||||
[validate_dateset_files,test_dateset_files]=np.split(validate_dateset_files,[int(len(pre_dataset_files)*self.validate_ratio)])
|
||||
if savaFlag:
|
||||
|
||||
tmp_data=np.load(pre_dataset_save_dir/dataset_type/pre_dataset_files[0],allow_pickle=True)
|
||||
feature_dim=tmp_data["rawSpectralData"].shape
|
||||
label_dim=tmp_data["rawLabels"].shape
|
||||
|
||||
# Write to HDF5 files
|
||||
for dataset_name in ["train","validate","test"]:
|
||||
if dataset_name=="train":
|
||||
file_names=train_dateset_files
|
||||
elif dataset_name=="validate":
|
||||
file_names=validate_dateset_files
|
||||
elif dataset_name=="test":
|
||||
file_names=test_dateset_files
|
||||
|
||||
logging.info(f"Generating {dataset_name} dataset with {len(file_names)} samples")
|
||||
with h5py.File(os.path.join( self.dataset_save_dir,f"{dataset_name}.h5"), 'w') as f:
|
||||
with h5py.File(os.path.join( dataset_save_dir,f"{dataset_type}.h5"), 'w') as f:
|
||||
h5_features = f.create_dataset(
|
||||
'features',
|
||||
tuple([len(file_names)]+list(feature_dim)),
|
||||
tuple([len(pre_dataset_files)]+list(feature_dim)),
|
||||
maxshape=(tuple([None]+list(feature_dim))),
|
||||
chunks=tuple([1]+list(feature_dim)), # manually set the chunk size to the size of one sample
|
||||
# compression='gzip',
|
||||
@ -299,52 +112,104 @@ class DataPreProcess:
|
||||
)
|
||||
h5_labels = f.create_dataset(
|
||||
'labels',
|
||||
tuple([len(file_names)]+list(label_dim)),
|
||||
tuple([len(pre_dataset_files)]+list(label_dim)),
|
||||
chunks=tuple([1]+list(label_dim)), # manually set the chunk size to the size of one sample
|
||||
dtype=np.float32,
|
||||
)
|
||||
for i,name in enumerate(file_names):
|
||||
tmp_data=np.load(os.path.join(pre_dataset_save_dir,name),allow_pickle=True)
|
||||
for i,name in enumerate(pre_dataset_files):
|
||||
tmp_data=np.load(os.path.join(pre_dataset_save_dir,dataset_type,name),allow_pickle=True)
|
||||
|
||||
feature=self.features_scaling.run(tmp_data["raw_spectral_data"])
|
||||
label=self.labels_scaling.run(tmp_data["raw_labels"])
|
||||
feature=self.features_scaling.run(tmp_data["rawSpectralData"])
|
||||
label=self.labels_scaling.run(tmp_data["rawLabels"])
|
||||
h5_features[i]=feature.astype(np.float32)
|
||||
h5_labels[i]=label.astype(np.float32)
|
||||
else:
|
||||
pre_dataset_files=os.listdir(pre_dataset_save_dir)
|
||||
|
||||
if self.labels_scaling.flag_get_global_info:
|
||||
for name in pre_dataset_files:
|
||||
tmp_data=np.load(os.path.join(pre_dataset_save_dir,name),allow_pickle=True)
|
||||
self.labels_scaling.get_global_info(tmp_data["raw_labels"])
|
||||
|
||||
def run(self):
|
||||
|
||||
save_dir=self.dataset_dir/self.choose_frame_spatial.description
|
||||
|
||||
# Step 1: select the feature frames and spatial points, then save them
|
||||
|
||||
pre_dataset_save_dir=save_dir/"pre_dataset"
|
||||
|
||||
if not os.path.exists(pre_dataset_save_dir):
|
||||
os.makedirs(pre_dataset_save_dir)
|
||||
|
||||
# If the pre-dataset directory does not exist it is regenerated; if it already exists this step is skipped.
# !!!!!!!!!! Note: if new raw data is added, delete the folders under dataset (i.e. clear this cache) so that everything is regenerated.
|
||||
self._raw_data_2_pre_dataset(pre_dataset_save_dir,dataset_type="training")
|
||||
self._raw_data_2_pre_dataset(pre_dataset_save_dir,dataset_type="validation")
|
||||
self._raw_data_2_pre_dataset(pre_dataset_save_dir,dataset_type="test")
|
||||
|
||||
else:
|
||||
logging.info(f"Pre Dataset is existed in {pre_dataset_save_dir}")
|
||||
|
||||
# Step 2: apply scaling/standardization and build the dataset
|
||||
|
||||
self.dataset_save_dir=save_dir / f"{self.features_scaling.description}_{self.labels_scaling.description}"
|
||||
|
||||
if not os.path.exists(self.dataset_save_dir):
|
||||
os.makedirs(self.dataset_save_dir)
|
||||
self._pre_dataset_2_dataset( pre_dataset_save_dir,self.dataset_save_dir,dataset_type="training",savaFlag=True)
|
||||
self._pre_dataset_2_dataset( pre_dataset_save_dir,self.dataset_save_dir,dataset_type="validation",savaFlag=True)
|
||||
self._pre_dataset_2_dataset( pre_dataset_save_dir,self.dataset_save_dir,dataset_type="test",savaFlag=True)
|
||||
|
||||
|
||||
else:
|
||||
|
||||
self._pre_dataset_2_dataset( pre_dataset_save_dir,self.dataset_save_dir,dataset_type="training",savaFlag=False)
|
||||
self._pre_dataset_2_dataset( pre_dataset_save_dir,self.dataset_save_dir,dataset_type="validation",savaFlag=False)
|
||||
self._pre_dataset_2_dataset( pre_dataset_save_dir,self.dataset_save_dir,dataset_type="test",savaFlag=False)
|
||||
logging.info(f"Standardized Dataset is existed in {self.dataset_save_dir}")
|
||||
|
||||
def get_metric(self,outputs,labels):
|
||||
# print("outputs Before",outputs)
|
||||
outputs=self.labels_scaling.reverse(outputs)
|
||||
# print("outputs After",outputs)
|
||||
# print("labels Before",labels)
|
||||
labels=self.labels_scaling.reverse(labels)
|
||||
# print("labels After",labels)
|
||||
error=outputs-labels
|
||||
|
||||
# print("outputs",outputs)
|
||||
# print("labels",labels)
|
||||
# print("errors",error)
|
||||
if len(error.shape)==1:
|
||||
hit_rate=np.zeros((1))
|
||||
else:
|
||||
hit_rate=np.zeros(error.shape[1])
|
||||
for i,name in enumerate(self.labels_name):
|
||||
# error[:,i]=outputs[:,i]-labels[:,i]
|
||||
|
||||
#["TSC_T","TSC_C","TSC_C_lab","TSC_P_lab"]
|
||||
bounds=np.zeros(error.shape)
|
||||
|
||||
if name =="TSC_T":
|
||||
bound=10
|
||||
elif name=="TSC_C":
|
||||
bound=0.05
|
||||
elif name=="TSC_C_lab":
|
||||
bound=0.05
|
||||
elif name=="TSC_P_lab":
|
||||
bound=0.0005
|
||||
# ["Temperature","C","P","S","Mn","Ni","Mo","Cr"]
|
||||
|
||||
|
||||
hit_rate[i]=((error[:,i]>=-bound) &(error[:,i]<=bound)).sum() /error.shape[0]
|
||||
return error,hit_rate
|
||||
for i,name in enumerate(self.labelNames):
|
||||
|
||||
if name =="Temperature":
|
||||
bounds[:,i]=10.0+np.zeros(labels[:,i].shape)
|
||||
elif name=="C":
|
||||
bounds[:,i]=0.05+np.zeros(labels[:,i].shape)
|
||||
elif name=="P":
|
||||
bounds[:,i]=0.005+np.zeros(labels[:,i].shape)
|
||||
elif name=="S":
|
||||
bounds[:,i]=0.01+np.zeros(labels[:,i].shape)
|
||||
elif name=="Mn":
|
||||
bounds[:,i]=0.05+np.zeros(labels[:,i].shape)
|
||||
elif name=="Ni":
|
||||
bounds[:,i]=0.25*labels[:,i]
|
||||
elif name=="Mo":
|
||||
bounds[:,i]=0.25*labels[:,i]
|
||||
elif name=="Cr":
|
||||
bounds[:,i]=0.25*labels[:,i]
|
||||
|
||||
# if isinstance(bound, float):
|
||||
|
||||
# print(f"{name}outputs:{outputs[0,i]}labels:{labels[0,i]}error:{error[0,i]}bound:{bound}")
|
||||
# else:
|
||||
# print(f"{name}outputs:{outputs[0,i]}labels:{labels[0,i]}error:{error[0,i]}bound:{bound[0]}")
|
||||
|
||||
hit_rate[i]=((error[:,i]>=-bounds[:,i]) &(error[:,i]<=bounds[:,i])).sum() /error.shape[0]
|
||||
|
||||
return outputs, labels,error,hit_rate,bounds
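For reference, the per-label hit rate computed above is simply the fraction of samples whose error lies inside ±bound. A compact NumPy restatement of the same metric, using made-up toy numbers rather than real heats, looks like this:

import numpy as np

outputs = np.array([[1605.0, 0.42], [1588.0, 0.35], [1610.0, 0.50]])   # e.g. Temperature, C
labels  = np.array([[1600.0, 0.40], [1595.0, 0.38], [1590.0, 0.45]])
bounds  = np.array([[10.0, 0.05]] * 3)                                  # tolerance per label

error = outputs - labels
hit_rate = ((error >= -bounds) & (error <= bounds)).sum(axis=0) / error.shape[0]
print(hit_rate)   # fraction of samples within tolerance, per label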
|
||||
|
||||
|
||||
|
||||
@ -355,19 +220,13 @@ if __name__=="__main__":
|
||||
logging.basicConfig(level = logging.DEBUG)
|
||||
|
||||
|
||||
raw_data_dir="/code/admin/20240806-NanEr-5-8-data/rawdata"
|
||||
labels_path="/code/admin/20240806-NanEr-5-8-data/labels/NanEr"
|
||||
dataset_dir="/code/admin/20240806-NanEr-5-8-data/dataset"
|
||||
labels_name=["TSC_T","TSC_C","TSC_C_lab","TSC_P_lab"]
|
||||
datasetDir="/data/SEMS-model-training/dataset"
|
||||
|
||||
|
||||
from choose_frame_spatial.mean import ChooseFrameSpatial
|
||||
from features_scaling.max_min import FeatureScaling
|
||||
from labels_scaling.max_min import LabelScaling
|
||||
choose_frame_spatial=ChooseFrameSpatial(interval=[-30,30])
|
||||
from choose_frame_spatial.mean_CARS100_3 import ChooseFrameSpatial
|
||||
from features_scaling.standardization import FeatureScaling
|
||||
from labels_scaling.standardization import LabelScaling
|
||||
choose_frame_spatial=ChooseFrameSpatial()
|
||||
features_scaling=FeatureScaling()
|
||||
labels_scaling=LabelScaling()
|
||||
|
||||
|
||||
|
||||
data_pre_process=DataPreProcess(raw_data_dir,labels_path,labels_name,dataset_dir,choose_frame_spatial,features_scaling,labels_scaling)
|
||||
data_pre_process=DataPreProcess(datasetDir,choose_frame_spatial,features_scaling,labels_scaling)
|
@ -4,10 +4,10 @@ import os
|
||||
import time
|
||||
|
||||
|
||||
def load_dataset(dataset_dir):
|
||||
train_dataset=SpectralDataset(os.path.join(dataset_dir,"train.h5"))
|
||||
val_dataset=SpectralDataset(os.path.join(dataset_dir,"validate.h5"))
|
||||
return train_dataset, val_dataset
|
||||
def load_dataset(dataset_dir,dataset_type="training"):
|
||||
dataset=SpectralDataset(os.path.join(dataset_dir,f"{dataset_type}.h5"))
|
||||
# val_dataset=SpectralDataset(os.path.join(dataset_dir,"validation.h5"))
|
||||
return dataset
|
||||
|
||||
class SpectralDataset(torch.utils.data.Dataset):
|
||||
def __init__(self, hdf5_path):
|
||||
@ -17,8 +17,8 @@ class SpectralDataset(torch.utils.data.Dataset):
|
||||
# rdcc_w0=0, # 缓存淘汰策略。值越接近 0,优先淘汰最近最少使用的 chunk。值越接近 1,优先淘汰已完全读取或写入的块
|
||||
# rdcc_nslots=1e8, # 定位缓存 chunk 位置的坐标长度。推荐为缓存 chunk 数的 10 倍,为达最佳性能需要 100 倍。默认为 521
|
||||
)
|
||||
self.features=self.hdf5_f["features"]
|
||||
self.labels=self.hdf5_f["labels"]
|
||||
self.features=self.hdf5_f["features"][...]
|
||||
self.labels=self.hdf5_f["labels"][...]
|
||||
def __len__(self):
|
||||
return self.features.shape[0]
|
||||
|
||||
@ -26,10 +26,15 @@ class SpectralDataset(torch.utils.data.Dataset):
|
||||
|
||||
|
||||
#构造Input
|
||||
|
||||
feature=self.features[idx]
|
||||
|
||||
label=self.labels[idx]
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
return feature, label
|
||||
|
||||
|
||||
@ -37,7 +42,7 @@ class SpectralDataset(torch.utils.data.Dataset):
|
||||
if __name__ =="__main__":
|
||||
|
||||
|
||||
training_data=SpectralDataset("/home/admin/20240806-NanEr-5-8-data/dataset/mean_-30_30/max_min_max_min/train.h5")
|
||||
training_data=SpectralDataset("/data/SEMS-model-training/dataset/ChooseFrameSpatial'>_-30_30/max_min_max_min/training.h5")
|
||||
print("数据集大小为:",len(training_data))
|
||||
print("输入Feature维度为:", training_data[0][0].shape, "输出Label维度为:",training_data[0][1].shape)
|
||||
|
||||
|
126
src/data/test.ipynb
Normal file
126
src/data/test.ipynb
Normal file
@ -0,0 +1,126 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[预处理]: training数据集进行数据预处理\n",
|
||||
"[预处理]Label全局信息:_num 549 _mean: [1591.5325 0.39140984 0.04271309 0.02187012 0.14065096\n",
|
||||
" 0.02857196 0.00202732 0.10250819] _var: [786.9018 0.01670736 0.00040787 0.0000213 0.00179902\n",
|
||||
" 0.00326453 0.00004455 0.00067495] _min: [1480. 0.079 0.0005 0.0003 0.0005 0.001 0.001\n",
|
||||
" 0.001 ] _max: [1666. 0.829 0.0957 0.044 0.7264 0.823 0.152\n",
|
||||
" 0.522 ]\n",
|
||||
"[预处理]: validation数据集进行数据预处理\n",
|
||||
"[预处理]Label全局信息:_num 549 _mean: [1591.5325 0.39140984 0.04271309 0.02187012 0.14065096\n",
|
||||
" 0.02857196 0.00202732 0.10250819] _var: [786.9018 0.01670736 0.00040787 0.0000213 0.00179902\n",
|
||||
" 0.00326453 0.00004455 0.00067495] _min: [1480. 0.079 0.0005 0.0003 0.0005 0.001 0.001\n",
|
||||
" 0.001 ] _max: [1666. 0.829 0.0957 0.044 0.7264 0.823 0.152\n",
|
||||
" 0.522 ]\n",
|
||||
"[预处理]: test数据集进行数据预处理\n",
|
||||
"[预处理]Label全局信息:_num 549 _mean: [1591.5325 0.39140984 0.04271309 0.02187012 0.14065096\n",
|
||||
" 0.02857196 0.00202732 0.10250819] _var: [786.9018 0.01670736 0.00040787 0.0000213 0.00179902\n",
|
||||
" 0.00326453 0.00004455 0.00067495] _min: [1480. 0.079 0.0005 0.0003 0.0005 0.001 0.001\n",
|
||||
" 0.001 ] _max: [1666. 0.829 0.0957 0.044 0.7264 0.823 0.152\n",
|
||||
" 0.522 ]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from pre_process import DataPreProcess\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"datasetDir=\"/data/SEMS-model-training/dataset\"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"from choose_frame_spatial.DBSCAN import ChooseFrameSpatial\n",
|
||||
"from features_scaling.standardization import FeatureScaling\n",
|
||||
"from labels_scaling.standardization import LabelScaling\n",
|
||||
"choose_frame_spatial=ChooseFrameSpatial()\n",
|
||||
"features_scaling=FeatureScaling()\n",
|
||||
"labels_scaling=LabelScaling()\n",
|
||||
"data_pre_process=DataPreProcess(datasetDir,choose_frame_spatial,features_scaling,labels_scaling)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"array([1591.5325 , 0.39140984, 0.04271309, 0.02187012,\n",
|
||||
" 0.14065096, 0.02857196, 0.00202732, 0.10250819],\n",
|
||||
" dtype=float32)"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"labels_scaling._mean"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Before: [1612.5325, 0.450984, 0.067271309, 0.05687012, 0.2365096, 0.06357196, 0.0034502732, 0.450250819]\n",
|
||||
"Scaled: [0.7486169269676385, 0.46089706984319634, 1.2160017876067477, 7.584260858761072, 2.26002739629723, 0.612572700572906, 0.21319305953525988, 13.385111606844243]\n",
|
||||
"Reversed\t: [1612.5325, 0.450984, 0.067271309, 0.05687012, 0.2365096, 0.06357196, 0.0034502732, 0.450250819]\n",
|
||||
"Before \t: [1612.5325, 0.450984, 0.067271309, 0.05687012, 0.2365096, 0.06357196, 0.0034502732, 0.450250819]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"labels= [1612.5325 , 0.450984, 0.067271309 , 0.05687012 , 0.2365096, 0.06357196 , 0.0034502732 , 0.450250819]\n",
|
||||
"print(f\"Before: {labels}\")\n",
|
||||
"tmp=labels_scaling.run(labels)\n",
|
||||
"print(f\"Scaled: {list(tmp)}\")\n",
|
||||
"labels_r=labels_scaling.reverse(tmp)\n",
|
||||
"print(f\"Reversed\\t: {list(labels_r)}\")\n",
|
||||
"print(f\"Before \\t: {labels}\")"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "dl",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.4"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
@ -5,6 +5,10 @@ import ray,os
|
||||
import ray.tune
|
||||
from functools import partial
|
||||
|
||||
import shutil
|
||||
|
||||
import subprocess
|
||||
|
||||
|
||||
|
||||
|
||||
@ -24,6 +28,16 @@ def main(hydra_cfg : omegaconf.DictConfig) -> None:
|
||||
hydra_output_dir=hydra_output_dir['runtime']['output_dir']
|
||||
|
||||
|
||||
command = f"""
|
||||
sudo kill -9 $(sudo lsof -t -i :6006);
|
||||
tensorboard --logdir {os.path.join(hydra_output_dir,"ray-tune")} --host 0.0.0.0 --port 6006
|
||||
"""
|
||||
|
||||
|
||||
# Spawn the TensorBoard subprocess
|
||||
tensorboard_subprocess = subprocess.Popen(command, shell=True, executable='/bin/bash',start_new_session=True)
|
||||
|
||||
|
||||
# Get the current hyperparameter configuration
|
||||
tune_cfg={}
|
||||
for key in hydra_cfg.ray_tune.config.keys():
|
||||
@ -44,10 +58,24 @@ def main(hydra_cfg : omegaconf.DictConfig) -> None:
|
||||
)
|
||||
|
||||
# Save the best model
|
||||
|
||||
best_trial = result.get_best_trial("val_loss", "min", "last")
|
||||
|
||||
|
||||
|
||||
|
||||
print(f"Best trial config: {best_trial.config}")
|
||||
print(f"Best trial final validation loss: {best_trial.last_result['val_loss']}")
|
||||
|
||||
print(f"Best trial checkpoint path: {best_trial.checkpoint.path}")
|
||||
|
||||
shutil.copytree(best_trial.checkpoint.path, os.path.join(hydra_output_dir,"best-model"))
|
||||
|
||||
# tensorboard_subprocess.terminate()
|
||||
# tensorboard_subprocess.wait()
|
||||
# print("TensorBoard terminated")
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
160
src/model/DRSN-CW.py
Normal file
160
src/model/DRSN-CW.py
Normal file
@ -0,0 +1,160 @@
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
|
||||
class BasicBlock(nn.Module):
|
||||
expansion = 1
|
||||
|
||||
def __init__(self, in_channels, out_channels, stride=1):
|
||||
super().__init__()
|
||||
self.shrinkage = Shrinkage(out_channels, gap_size=(1))
|
||||
# residual function
|
||||
self.residual_function = nn.Sequential(
|
||||
nn.Conv1d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False),
|
||||
nn.BatchNorm1d(out_channels),
|
||||
nn.ReLU(inplace=True),
|
||||
nn.Conv1d(out_channels, out_channels * BasicBlock.expansion, kernel_size=3, padding=1, bias=False),
|
||||
nn.BatchNorm1d(out_channels * BasicBlock.expansion),
|
||||
self.shrinkage
|
||||
)
|
||||
# shortcut
|
||||
self.shortcut = nn.Sequential()
|
||||
|
||||
# When the shortcut output dimension does not match the residual function,
# use a 1x1 convolution to match the dimension
|
||||
if stride != 1 or in_channels != BasicBlock.expansion * out_channels:
|
||||
self.shortcut = nn.Sequential(
|
||||
nn.Conv1d(in_channels, out_channels * BasicBlock.expansion, kernel_size=1, stride=stride, bias=False),
|
||||
nn.BatchNorm1d(out_channels * BasicBlock.expansion)
|
||||
)
|
||||
|
||||
def forward(self, x):
|
||||
|
||||
return nn.ReLU(inplace=True)(self.residual_function(x) + self.shortcut(x))
|
||||
# a = self.residual_function(x),
|
||||
# b = self.shortcut(x),
|
||||
# c = a+b
|
||||
# return c
|
||||
|
||||
|
||||
class Shrinkage(nn.Module):
|
||||
def __init__(self, channel, gap_size):
|
||||
super(Shrinkage, self).__init__()
|
||||
self.gap = nn.AdaptiveAvgPool1d(gap_size)
|
||||
self.fc = nn.Sequential(
|
||||
nn.Linear(channel, channel),
|
||||
nn.ReLU(inplace=True),
|
||||
nn.Linear(channel, channel),
|
||||
nn.Sigmoid(),
|
||||
)
|
||||
|
||||
def forward(self, x):
|
||||
x_raw = x
|
||||
x = torch.abs(x)
|
||||
x_abs = x
|
||||
x = self.gap(x)
|
||||
x = torch.flatten(x, 1)
|
||||
# average = torch.mean(x, dim=1, keepdim=True) #CS
|
||||
average = x #CW
|
||||
x = self.fc(x)
|
||||
x = torch.mul(average, x)
|
||||
x = x.unsqueeze(2)
|
||||
# soft thresholding
|
||||
sub = x_abs - x
|
||||
zeros = sub - sub
|
||||
n_sub = torch.max(sub, zeros)
|
||||
x = torch.mul(torch.sign(x_raw), n_sub)
|
||||
return x
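The Shrinkage block above implements channel-wise soft thresholding, y = sign(x) * max(|x| - tau, 0), where tau is predicted per channel by the small fully connected gate. A minimal standalone check of that identity (my own example, independent of the model) is:

import torch

x = torch.tensor([[-1.5, -0.2, 0.1, 0.8]])
tau = torch.full_like(x, 0.3)   # a fixed threshold just for the demo

soft = torch.sign(x) * torch.clamp(torch.abs(x) - tau, min=0.0)
print(soft)   # [-1.2, 0.0, 0.0, 0.5] up to the sign of zero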
|
||||
|
||||
|
||||
class SpectralModel(nn.Module):
|
||||
|
||||
def __init__(self, block=BasicBlock, num_block=[3, 4, 6, 3], num_classes=8):
|
||||
super().__init__()
|
||||
|
||||
self.in_channels = 64
|
||||
|
||||
self.conv1 = nn.Sequential(
|
||||
nn.Conv1d(1, 64, kernel_size=3, padding=1, bias=False),
|
||||
nn.BatchNorm1d(64),
|
||||
nn.ReLU(inplace=True))
|
||||
# we use a different input size than the original paper,
# so conv2_x's stride is 1
|
||||
self.conv2_x = self._make_layer(block, 64, num_block[0], 1)
|
||||
self.conv3_x = self._make_layer(block, 128, num_block[1], 2)
|
||||
self.conv4_x = self._make_layer(block, 256, num_block[2], 2)
|
||||
self.conv5_x = self._make_layer(block, 512, num_block[3], 2)
|
||||
self.avg_pool = nn.AdaptiveAvgPool1d((1))
|
||||
# self.fc = nn.Linear(512 * block.expansion, 1024)
|
||||
# self.fc = nn.Linear(1024 , 512)
|
||||
# self.fc = nn.Linear(512, 512)
|
||||
self.fc = nn.Sequential(
|
||||
nn.BatchNorm1d(512),
|
||||
nn.Linear(512 * block.expansion, 1024),
|
||||
nn.ReLU(inplace=True),nn.BatchNorm1d(1024),
|
||||
nn.Linear(1024 , 512),
|
||||
nn.ReLU(inplace=True),nn.BatchNorm1d(512),
|
||||
nn.Linear(512, 128),
|
||||
nn.ReLU(inplace=True),nn.BatchNorm1d(128),
|
||||
nn.Linear(128, 8),)
|
||||
|
||||
def _make_layer(self, block, out_channels, num_blocks, stride):
|
||||
"""make rsnet layers(by layer i didnt mean this 'layer' was the
|
||||
same as a neuron netowork layer, ex. conv layer), one layer may
|
||||
contain more than one residual shrinkage block
|
||||
|
||||
Args:
|
||||
block: block type, basic block or bottle neck block
|
||||
out_channels: output depth channel number of this layer
|
||||
num_blocks: how many blocks per layer
|
||||
stride: the stride of the first block of this layer
|
||||
|
||||
Return:
|
||||
return a rsnet layer
|
||||
"""
|
||||
|
||||
# we have num_blocks blocks per layer; the stride of the first block
# can be 1 or 2, all the other blocks always use stride 1
|
||||
strides = [stride] + [1] * (num_blocks - 1)
|
||||
layers = []
|
||||
for stride in strides:
|
||||
layers.append(block(self.in_channels, out_channels, stride))
|
||||
self.in_channels = out_channels * block.expansion
|
||||
|
||||
return nn.Sequential(*layers)
|
||||
|
||||
def forward(self, x):
|
||||
x=torch.unsqueeze(x,1)
|
||||
output = self.conv1(x)
|
||||
output = self.conv2_x(output)
|
||||
output = self.conv3_x(output)
|
||||
output = self.conv4_x(output)
|
||||
output = self.conv5_x(output)
|
||||
output = self.avg_pool(output)
|
||||
output = output.view(output.size(0), -1)
|
||||
output = self.fc(output)
|
||||
|
||||
return output
|
||||
|
||||
|
||||
# def rsnet18():
|
||||
# """ return a RSNet 18 object
|
||||
# """
|
||||
# return RSNet(BasicBlock, [2, 2, 2, 2])
|
||||
|
||||
|
||||
# def rsnet34():
|
||||
# """ return a RSNet 34 object
|
||||
# """
|
||||
# return RSNet(BasicBlock, [3, 4, 6, 3])
|
||||
|
||||
if __name__=='__main__':
|
||||
import torchinfo
|
||||
model = SpectralModel()
|
||||
model.eval()
|
||||
# print(model)
|
||||
input = torch.randn(64,224)
|
||||
y = model(input)
|
||||
print(y.size())
|
||||
|
||||
torchinfo.summary(model,input_size=(64,224))
|
BIN
src/model/__pycache__/CNN_FCNN_big.cpython-312.pyc
Normal file
BIN
src/model/__pycache__/CNN_FCNN_big.cpython-312.pyc
Normal file
Binary file not shown.
BIN
src/model/__pycache__/CNN_FCNN_small.cpython-312.pyc
Normal file
BIN
src/model/__pycache__/CNN_FCNN_small.cpython-312.pyc
Normal file
Binary file not shown.
Binary file not shown.
BIN
src/model/__pycache__/DRSN-CW.cpython-312.pyc
Normal file
BIN
src/model/__pycache__/DRSN-CW.cpython-312.pyc
Normal file
Binary file not shown.
BIN
src/model/__pycache__/FCNN_DBSCAN_big.cpython-312.pyc
Normal file
BIN
src/model/__pycache__/FCNN_DBSCAN_big.cpython-312.pyc
Normal file
Binary file not shown.
BIN
src/model/__pycache__/FCNN_DBSCAN_small.cpython-312.pyc
Normal file
BIN
src/model/__pycache__/FCNN_DBSCAN_small.cpython-312.pyc
Normal file
Binary file not shown.
BIN
src/model/__pycache__/FNN_CARS.cpython-312.pyc
Normal file
BIN
src/model/__pycache__/FNN_CARS.cpython-312.pyc
Normal file
Binary file not shown.
BIN
src/model/__pycache__/FNN_CARS_big.cpython-312.pyc
Normal file
BIN
src/model/__pycache__/FNN_CARS_big.cpython-312.pyc
Normal file
Binary file not shown.
BIN
src/model/__pycache__/FNN_CARS_small.cpython-312.pyc
Normal file
BIN
src/model/__pycache__/FNN_CARS_small.cpython-312.pyc
Normal file
Binary file not shown.
@ -3,9 +3,9 @@ import torch.nn.functional as F
|
||||
import torchinfo
|
||||
import torch
|
||||
|
||||
class CNN_FCNN(nn.Module):
|
||||
class SpectralModel(nn.Module):
|
||||
def __init__(self,):
|
||||
super(CNN_FCNN, self).__init__()
|
||||
super(SpectralModel, self).__init__()
|
||||
|
||||
self.Conv2d_1=nn.Conv2d(in_channels=1, out_channels=512, kernel_size=3,stride=1)
|
||||
self.Conv2d_2=nn.Conv2d(in_channels=512, out_channels=1024, kernel_size=3,stride=1)
|
||||
@ -24,7 +24,7 @@ class CNN_FCNN(nn.Module):
|
||||
self.fc_2=nn.Linear(in_features=2048,out_features=1024)
|
||||
self.fc_3=nn.Linear(in_features=1024,out_features=512)
|
||||
self.fc_4=nn.Linear(in_features=512,out_features=256)
|
||||
self.fc_5=nn.Linear(in_features=256,out_features=4)
|
||||
self.fc_5=nn.Linear(in_features=256,out_features=8)
|
||||
|
||||
|
||||
# self.reshape_3=nn.Unflatten(dim=0,unflattened_size=(batch_size,601))
|
||||
@ -71,6 +71,6 @@ class CNN_FCNN(nn.Module):
|
||||
return x
|
||||
|
||||
if __name__=="__main__":
|
||||
model=CNN_FCNN()
|
||||
model=SpectralModel()
|
||||
|
||||
torchinfo.summary(model,input_size=(64, 48, 224, 10))
|
||||
torchinfo.summary(model,input_size=(64, 20, 224, 10))
|
78
src/model/history/CNN_FCNN_small.py
Normal file
78
src/model/history/CNN_FCNN_small.py
Normal file
@ -0,0 +1,78 @@
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
import torchinfo
|
||||
import torch
|
||||
|
||||
class SpectralModel(nn.Module):
|
||||
def __init__(self,):
|
||||
super(SpectralModel, self).__init__()
|
||||
|
||||
self.Conv2d_1=nn.Conv2d(in_channels=1, out_channels=4, kernel_size=3,stride=1)
|
||||
self.Conv2d_2=nn.Conv2d(in_channels=4, out_channels=8, kernel_size=3,stride=1)
|
||||
# self.MaxPool2d_1=nn.MaxPool2d(kernel_size=2, stride=2)
|
||||
|
||||
self.Conv2d_3=nn.Conv2d(in_channels=8, out_channels=16, kernel_size=3,stride=1)
|
||||
|
||||
self.Conv2d_4=nn.Conv2d(in_channels=16, out_channels=8, kernel_size=3,stride=1)
|
||||
self.Conv2d_5=nn.Conv2d(in_channels=8, out_channels=4, kernel_size=2,stride=1)
|
||||
self.flatten=nn.Flatten(start_dim=1, end_dim=3)
|
||||
|
||||
|
||||
|
||||
self.fc_1=nn.Linear(in_features=860,out_features=512) #2048->1024
|
||||
|
||||
self.fc_2=nn.Linear(in_features=512,out_features=256)
|
||||
self.fc_3=nn.Linear(in_features=256,out_features=128)
|
||||
self.fc_4=nn.Linear(in_features=128,out_features=64)
|
||||
self.fc_5=nn.Linear(in_features=64,out_features=8)
|
||||
|
||||
|
||||
# self.reshape_3=nn.Unflatten(dim=0,unflattened_size=(batch_size,601))
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def forward(self, x):
|
||||
# input_shape=x.shape
|
||||
|
||||
x=torch.mean(x,dim=1,keepdim=True)
|
||||
|
||||
# x=torch.unsqueeze(x, 1)
|
||||
x=F.tanh(self.Conv2d_1(x))
|
||||
x=F.tanh(self.Conv2d_2(x))
|
||||
# x=self.MaxPool2d_1(x)
|
||||
x=F.tanh(self.Conv2d_3(x))
|
||||
x=F.tanh(self.Conv2d_4(x))
|
||||
x=F.tanh(self.Conv2d_5(x))
|
||||
|
||||
x=self.flatten(x)
|
||||
print(x.shape)
|
||||
x=F.tanh(self.fc_1(x))
|
||||
x=F.tanh(self.fc_2(x))
|
||||
x=F.tanh(self.fc_3(x))
|
||||
x=F.tanh(self.fc_4(x))
|
||||
x=F.sigmoid(self.fc_5(x))
|
||||
|
||||
#
|
||||
# x=nn.Unflatten(dim=0,unflattened_size=(int(input_shape[0]),int(input_shape[1])))(x)
|
||||
# x=nn.Flatten(start_dim=2, end_dim=4)(x)
|
||||
|
||||
|
||||
# x,_=self.lstm_1(x)
|
||||
|
||||
# x=x[:,-1,:]
|
||||
|
||||
|
||||
# x=nn.LeakyReLU()(self.fc_1(x))
|
||||
# x=nn.LeakyReLU()(self.fc_2(x))
|
||||
# x=nn.Sigmoid()(self.fc_3(x))
|
||||
|
||||
|
||||
|
||||
return x
|
||||
|
||||
if __name__=="__main__":
|
||||
model=SpectralModel()
|
||||
|
||||
torchinfo.summary(model,input_size=(64, 20, 224, 10))
|
@ -3,9 +3,9 @@ import torch.nn.functional as F
|
||||
import torchinfo
|
||||
import torch
|
||||
|
||||
class CNN_LSTM_FCNN(nn.Module):
|
||||
class SpectralModel(nn.Module):
|
||||
def __init__(self,):
|
||||
super(CNN_LSTM_FCNN, self).__init__()
|
||||
super(SpectralModel, self).__init__()
|
||||
|
||||
# self.reshape_1=nn.Flatten(start_dim=0, end_dim=1)
|
||||
self.Conv2d_1=nn.Conv2d(in_channels=1, out_channels=256, kernel_size=3,stride=1)
|
||||
@ -27,7 +27,7 @@ class CNN_LSTM_FCNN(nn.Module):
|
||||
self.fc_1=nn.Linear(in_features=2048,out_features=1024) #2048->1024
|
||||
|
||||
self.fc_2=nn.Linear(in_features=1024,out_features=256)
|
||||
self.fc_3=nn.Linear(in_features=256,out_features=4)
|
||||
self.fc_3=nn.Linear(in_features=256,out_features=8)
|
||||
|
||||
|
||||
# self.reshape_3=nn.Unflatten(dim=0,unflattened_size=(batch_size,601))
|
||||
@ -64,6 +64,6 @@ class CNN_LSTM_FCNN(nn.Module):
|
||||
return x
|
||||
|
||||
if __name__=="__main__":
|
||||
model=CNN_LSTM_FCNN()
|
||||
model=SpectralModel()
|
||||
|
||||
torchinfo.summary(model,input_size=(64, 48, 224, 10))
|
||||
torchinfo.summary(model,input_size=(64, 20, 224, 10))
|
55
src/model/history/FCNN_DBSCAN_big.py
Normal file
55
src/model/history/FCNN_DBSCAN_big.py
Normal file
@ -0,0 +1,55 @@
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
import torchinfo
|
||||
import torch
|
||||
|
||||
class SpectralModel(nn.Module):
|
||||
def __init__(self,):
|
||||
super(SpectralModel, self).__init__()
|
||||
self.norm=nn.BatchNorm1d(224)
|
||||
self.fc1 = nn.Linear(224, 1024)
|
||||
self.fc2 = nn.Linear(1024, 2048)
|
||||
self.fc3 = nn.Linear(2048, 4096)
|
||||
self.fc4 = nn.Linear(4096, 2048)
|
||||
self.fc5 = nn.Linear(2048, 1024)
|
||||
self.fc6 = nn.Linear(1024, 512)
|
||||
|
||||
self.fc7 = nn.Linear(512, 128)
|
||||
self.fc8 = nn.Linear(128, 8)
|
||||
|
||||
|
||||
|
||||
def forward(self, x):
|
||||
x=self.norm(x)
|
||||
x = F.relu(self.fc1(x))
|
||||
x = F.relu(self.fc2(x))
|
||||
x = F.relu(self.fc3(x))
|
||||
x = F.relu(self.fc4(x))
|
||||
x = F.relu(self.fc5(x))
|
||||
x = F.relu(self.fc6(x))
|
||||
x = F.relu(self.fc7(x))
|
||||
x = F.relu(self.fc8(x))
|
||||
x = F.sigmoid(x)
|
||||
return x
|
||||
# class SpectralModel(nn.Module):
|
||||
# def __init__(self,):
|
||||
# super(SpectralModel, self).__init__()
|
||||
|
||||
# self.norm=nn.BatchNorm1d(224)
|
||||
# self.submodels = nn.ModuleList([SmallFCNNModel() for _ in range(8)])
|
||||
|
||||
|
||||
|
||||
|
||||
# def forward(self, x):
|
||||
# x=self.norm(x)
|
||||
|
||||
# res = [submodel(x) for submodel in self.submodels]
|
||||
# x=torch.cat(res,dim=1)
|
||||
|
||||
# return x
|
||||
|
||||
if __name__=="__main__":
|
||||
model=SpectralModel().to("cuda:0")
|
||||
|
||||
torchinfo.summary(model,input_size=(64, 224))
|
68
src/model/history/FCNN_DBSCAN_small.py
Normal file
68
src/model/history/FCNN_DBSCAN_small.py
Normal file
@ -0,0 +1,68 @@
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
import torchinfo
|
||||
import torch
|
||||
|
||||
class SmallFCNNModel(nn.Module):
|
||||
def __init__(self,):
|
||||
super(SmallFCNNModel, self).__init__()
|
||||
|
||||
self.fc1 = nn.Linear(224, 50)
|
||||
|
||||
self.fc9 = nn.Linear(50, 1)
|
||||
|
||||
|
||||
|
||||
def forward(self, x):
|
||||
# x=self.norm(x)
|
||||
x = F.relu(self.fc1(x))
|
||||
x = F.sigmoid(self.fc9(x))
|
||||
return x
|
||||
class SpectralModel(nn.Module):
|
||||
def __init__(self,):
|
||||
super(SpectralModel, self).__init__()
|
||||
|
||||
self.norm=nn.BatchNorm1d(224)
|
||||
# self.fc1 = nn.Linear(100, 50)
|
||||
# # self.fc2 = nn.Linear(1024, 2048)
|
||||
# # self.fc3 = nn.Linear(2048, 4096)
|
||||
# # self.fc4 = nn.Linear(4096, 2048)
|
||||
# # self.fc5 = nn.Linear(2048, 1024)
|
||||
# # self.fc6 = nn.Linear(1024, 512)
|
||||
# # self.fc7 = nn.Linear(512, 256)
|
||||
# # self.fc8 = nn.Linear(256, 128)
|
||||
# self.fc9 = nn.Linear(50, 8)
|
||||
# # self.fc4 = nn.Linear(10240, 4)
|
||||
self.submodels = nn.ModuleList([SmallFCNNModel() for _ in range(8)])
|
||||
|
||||
|
||||
|
||||
|
||||
def forward(self, x):
|
||||
x=self.norm(x)
|
||||
|
||||
res = [submodel(x) for submodel in self.submodels]
|
||||
# res=[self.submodels[0](x)]
|
||||
|
||||
# for i in range(1,len(self.submodels)):
|
||||
# res.append(self.submodels[i](0*x))
|
||||
|
||||
|
||||
|
||||
|
||||
# x = F.relu(self.fc2(x))
|
||||
# x = F.relu(self.fc3(x))
|
||||
# x = F.relu(self.fc4(x))
|
||||
# x = F.relu(self.fc5(x))
|
||||
# x = F.relu(self.fc6(x))
|
||||
# x = F.relu(self.fc7(x))
|
||||
# x = F.relu(self.fc8(x))
|
||||
# x = F.sigmoid(self.fc9(x))
|
||||
x=torch.cat(res,dim=1)
|
||||
|
||||
return x
|
||||
|
||||
if __name__=="__main__":
|
||||
model=SpectralModel().to("cuda:0")
|
||||
|
||||
torchinfo.summary(model,input_size=(64, 224))
|
Binary file not shown.
BIN
src/optimizer/__pycache__/utils.cpython-312.pyc
Normal file
BIN
src/optimizer/__pycache__/utils.cpython-312.pyc
Normal file
Binary file not shown.
@ -22,6 +22,8 @@ import numpy as np
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
from optimizer.utils import save_inference_files
|
||||
|
||||
logger = logging.getLogger("ray")
|
||||
|
||||
def trainable(hydra_cfg,tune_cfg):
|
||||
@ -39,7 +41,7 @@ def trainable(hydra_cfg,tune_cfg):
|
||||
## Preprocess the raw data into a dataset ##
|
||||
############################
|
||||
t_1=time.time()
|
||||
data_preprocess=DataPreProcess( hydra_cfg.raw_spectral_data_dir,hydra_cfg.raw_labels_dir,hydra_cfg.labels_name,hydra_cfg.dataset_dir,hydra.utils.instantiate(tune_cfg["choose_frame_spatial"]), hydra.utils.instantiate(tune_cfg["features_scaling"]),hydra.utils.instantiate(tune_cfg["labels_scaling"]),hydra_cfg.dataset.train_ratio,hydra_cfg.dataset.validate_ratio)
|
||||
data_preprocess=DataPreProcess( hydra_cfg.dataset_dir,hydra.utils.instantiate(tune_cfg["choose_frame_spatial"]), hydra.utils.instantiate(tune_cfg["features_scaling"]),hydra.utils.instantiate(tune_cfg["labels_scaling"]))
|
||||
t_2=time.time()
|
||||
logger.info(f"Preprocessed raw data costs {t_2-t_1}s")
|
||||
|
||||
@ -47,12 +49,18 @@ def trainable(hydra_cfg,tune_cfg):
|
||||
## Load the dataset into dataloaders ##
|
||||
############################
|
||||
|
||||
train_dataset,val_dataset=load_dataset(data_preprocess.dataset_save_dir)
|
||||
train_dataset=load_dataset(data_preprocess.dataset_save_dir,dataset_type="training")
|
||||
val_dataset=load_dataset(data_preprocess.dataset_save_dir,dataset_type="validation")
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
trainloader = torch.utils.data.DataLoader(
|
||||
train_dataset, batch_size=int(tune_cfg["batch_size"]), shuffle=True, num_workers=hydra_cfg.dataset.num_worker
|
||||
train_dataset, batch_size=int(tune_cfg["batch_size"]), shuffle=True, num_workers=hydra_cfg.dataset.num_worker,drop_last=False
|
||||
)
|
||||
valloader = torch.utils.data.DataLoader(
|
||||
val_dataset, batch_size=int(tune_cfg["batch_size"]), shuffle=True, num_workers=hydra_cfg.dataset.num_worker
|
||||
val_dataset, batch_size=int(tune_cfg["batch_size"]), shuffle=True, num_workers=hydra_cfg.dataset.num_worker,drop_last=False
|
||||
)
|
||||
t_3=time.time()
|
||||
logger.info(f"Dataloader costs {t_3-t_2}s")
|
||||
@ -101,7 +109,7 @@ def trainable(hydra_cfg,tune_cfg):
|
||||
## Write the model architecture graph to TensorBoard ##
|
||||
################################
|
||||
sample = train_dataset[0:4][0]
|
||||
print(sample.shape)
|
||||
# print(sample.shape)
|
||||
writer.add_graph(model, torch.tensor(sample).to(device))
|
||||
|
||||
|
||||
@ -113,6 +121,7 @@ def trainable(hydra_cfg,tune_cfg):
|
||||
################################
|
||||
## Model training ##
|
||||
################################
|
||||
|
||||
logger.info(f"Start epoch {epoch+1}")
|
||||
train_loss = 0.0
|
||||
train_steps = 0
|
||||
@ -121,12 +130,29 @@ def trainable(hydra_cfg,tune_cfg):
|
||||
train_errors=None
|
||||
train_outputs=None
|
||||
train_labels=None
|
||||
train_bounds=None
|
||||
model.train()
|
||||
|
||||
# t_1=time.time()
|
||||
# for batch, (features, labels) in enumerate(trainloader):
|
||||
# features=features.to(device)
|
||||
# labels=labels.to(device)
|
||||
|
||||
# t_2=time.time()
|
||||
# print(f"DEBUDING cost{t_2-t_1}s")
|
||||
|
||||
for batch, (features, labels) in enumerate(trainloader):
|
||||
|
||||
|
||||
|
||||
t_iteration_start=time.time()
|
||||
features=features.to(device)
|
||||
labels=labels.to(device)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# zero the parameter gradients
|
||||
optimizer.zero_grad()
|
||||
|
||||
@ -136,9 +162,13 @@ def trainable(hydra_cfg,tune_cfg):
|
||||
|
||||
labels_npy=labels.cpu().detach().numpy()
|
||||
outputs_npy=outputs.cpu().detach().numpy()
|
||||
# print("outputs_npy",outputs_npy)
|
||||
# print("labels_npy",labels_npy)
|
||||
|
||||
# Compute the custom metrics
|
||||
error,hit_rate=data_preprocess.get_metric(outputs_npy, labels_npy)
|
||||
outputs_npy, labels_npy,error,hit_rate,bounds=data_preprocess.get_metric(outputs_npy, labels_npy)
|
||||
# print("outputs_npy_after",outputs_npy)
|
||||
# print("labels_npy_after",labels_npy)
|
||||
|
||||
# Backpropagation
|
||||
loss.backward()
|
||||
@ -147,16 +177,21 @@ def trainable(hydra_cfg,tune_cfg):
|
||||
# Record our custom metrics
|
||||
train_loss += loss.item()
|
||||
train_steps += 1
|
||||
|
||||
if train_steps==1:
|
||||
train_hit_rate=hit_rate
|
||||
train_errors=error
|
||||
train_outputs=outputs_npy
|
||||
train_labels=labels_npy
|
||||
train_bounds=bounds
|
||||
else:
|
||||
|
||||
|
||||
train_hit_rate=(train_steps-1)/train_steps *train_hit_rate+ 1/train_steps*hit_rate
|
||||
train_errors=np.concatenate((train_errors,error),axis=0)
|
||||
train_outputs=np.concatenate((train_outputs,outputs_npy),axis=0)
|
||||
train_labels=np.concatenate((train_labels,labels_npy),axis=0)
|
||||
train_bounds=np.concatenate((train_bounds,bounds),axis=0)
|
||||
|
||||
t_iteration_end=time.time()
|
||||
print("Training Epoch:[{}/{}],Iteration:{}/{},Loss:{}, Cost {}s".format(epoch+1,hydra_cfg.train.max_epoch,train_steps,len(trainloader),loss.item(),t_iteration_end-t_iteration_start))
|
||||
@ -168,6 +203,9 @@ def trainable(hydra_cfg,tune_cfg):
|
||||
|
||||
val_loss = 0.0
|
||||
val_steps = 0
|
||||
val_errors=None
|
||||
val_outputs=None
|
||||
val_labels=None
|
||||
val_hit_rate=None
|
||||
|
||||
t_epoch_train_end=time.time()
|
||||
@ -177,20 +215,30 @@ def trainable(hydra_cfg,tune_cfg):
|
||||
for batch, (features, labels) in enumerate(valloader):
|
||||
t_iteration_start=time.time()
|
||||
with torch.no_grad():
|
||||
|
||||
features=features.to(device)
|
||||
labels=labels.to(device)
|
||||
|
||||
outputs = model(features)
|
||||
loss = criterion(outputs, labels)
|
||||
|
||||
error,hit_rate=data_preprocess.get_metric(outputs.cpu().detach().numpy(), labels.cpu().detach().numpy())
|
||||
labels_npy=labels.cpu().detach().numpy()
|
||||
outputs_npy=outputs.cpu().detach().numpy()
|
||||
|
||||
outputs_npy, labels_npy,error,hit_rate,bounds=data_preprocess.get_metric(outputs_npy, labels_npy)
|
||||
|
||||
val_loss += loss.cpu().numpy()
|
||||
val_steps += 1
|
||||
if val_steps==1:
|
||||
val_hit_rate=hit_rate
|
||||
val_errors=error
|
||||
val_outputs=outputs_npy
|
||||
val_labels=labels_npy
|
||||
else:
|
||||
val_hit_rate=(val_steps-1)/val_steps *val_hit_rate+ 1/val_steps*hit_rate
|
||||
val_errors=np.concatenate((val_errors,error),axis=0)
|
||||
val_outputs=np.concatenate((val_outputs,outputs_npy),axis=0)
|
||||
val_labels=np.concatenate((val_labels,labels_npy),axis=0)
|
||||
t_iteration_end=time.time()
|
||||
print("Validate Iteration:{}/{},Loss:{}, Cost {}s".format(val_steps,len(valloader),loss.item(),t_iteration_end-t_iteration_start))
|
||||
|
||||
@ -206,41 +254,157 @@ def trainable(hydra_cfg,tune_cfg):
|
||||
|
||||
|
||||
|
||||
checkpoint_data = {
|
||||
"epoch": epoch,
|
||||
"net_state_dict": model.state_dict(),
|
||||
"optimizer_state_dict": optimizer.state_dict(),
|
||||
}
|
||||
|
||||
|
||||
with tempfile.TemporaryDirectory() as checkpoint_dir:
|
||||
data_path = pathlib.Path(checkpoint_dir) / "data.pkl"
|
||||
with open(data_path, "wb") as fp:
|
||||
pickle.dump(checkpoint_data, fp)
|
||||
|
||||
checkpoint = ray.train.Checkpoint.from_directory(checkpoint_dir)
|
||||
|
||||
reports={"val_loss": val_loss / val_steps,
|
||||
"train_loss": train_loss,
|
||||
}
|
||||
for i,name in enumerate(hydra_cfg.labels_name):
|
||||
reports[f"train_hit_rate_{name}"]=train_hit_rate[i]
|
||||
reports[f"val_hit_rate_{name}"]=val_hit_rate[i]
|
||||
writer.add_histogram(tag=f'train_error_{name}', values=train_errors[:,i], global_step=epoch)
|
||||
|
||||
for i,name in enumerate(data_preprocess.labelNames):
|
||||
# reports[=
|
||||
writer.add_scalar(f"{name}/training/hit_rate",train_hit_rate[i],global_step=epoch)
|
||||
writer.add_scalar(f"{name}/validation/hit_rate",val_hit_rate[i],global_step=epoch)
|
||||
|
||||
if (epoch!=0 and epoch%hydra_cfg.train.checkpoint_interval==0):
|
||||
|
||||
|
||||
for i,name in enumerate(data_preprocess.labelNames):
|
||||
# reports[=
|
||||
|
||||
writer.add_histogram(tag=f'{name}/training/error_histogram', values=train_errors[:,i], global_step=epoch)
|
||||
writer.add_histogram(tag=f'{name}/validation/error_histogram', values=val_errors[:,i], global_step=epoch)
|
||||
|
||||
fig, ax = plt.subplots()
|
||||
ax.scatter(np.arange(train_outputs.shape[0]),train_outputs[:,i])
|
||||
ax.scatter(np.arange(train_outputs.shape[0]),train_labels[:,i])
|
||||
writer.add_figure(f'train_imgage_{name}', fig , epoch)
|
||||
ax.scatter(train_labels[:,i],train_outputs[:,i])
|
||||
ax.plot([train_labels[:,i].min(), train_labels[:,i].max()], [train_labels[:,i].min(), train_labels[:,i].max()], 'r--')  # identity line
|
||||
ax.plot(train_labels[:,i],train_labels[:,i]-train_bounds[:,i], 'r--')
|
||||
ax.plot(train_labels[:,i],train_labels[:,i]+train_bounds[:,i], 'r--')
|
||||
|
||||
|
||||
ax.set_xlabel('Actual Values')
|
||||
ax.set_ylabel('Estimated Values')
|
||||
writer.add_figure(f'{name}/training/actual_vs_estimated_value', fig , epoch)
|
||||
plt.close(fig)
|
||||
|
||||
|
||||
fig, ax = plt.subplots()
|
||||
ax.scatter(val_labels[:,i],val_outputs[:,i])
|
||||
ax.plot([val_labels[:,i].min(), val_labels[:,i].max()], [val_outputs[:,i].min(), val_outputs[:,i].max()], 'r--')
|
||||
ax.set_xlabel('Actual Values')
|
||||
ax.set_ylabel('Estimated Values')
|
||||
writer.add_figure(f'{name}/validation/actual_vs_estimated_value', fig , epoch)
|
||||
plt.close(fig)
|
||||
|
||||
|
||||
|
||||
fig, ax = plt.subplots()
|
||||
ax.scatter(train_labels[:,i],train_errors[:,i])
|
||||
ax.plot(train_labels[:,i],[0 for i in range(train_labels.shape[0])], color='r', linestyle='--')
|
||||
ax.plot(train_labels[:,i],-train_bounds[:,i], 'r--')
|
||||
ax.plot(train_labels[:,i],+train_bounds[:,i], 'r--')
|
||||
ax.set_xlabel('Actual Values')
|
||||
ax.set_ylabel('Residual error')
|
||||
|
||||
writer.add_figure(f'{name}/training/training_actual_vs_residual', fig , epoch)
|
||||
plt.close(fig)
|
||||
|
||||
|
||||
|
||||
fig, ax = plt.subplots()
|
||||
ax.scatter(val_labels[:,i],val_errors[:,i])
|
||||
ax.plot(val_labels[:,i],[0 for i in range(val_labels.shape[0])], color='r', linestyle='--')
|
||||
ax.set_xlabel('Actual Values')
|
||||
ax.set_ylabel('Residual error')
|
||||
writer.add_figure(f'{name}/validation/training_actual_vs_Residual', fig , epoch)
|
||||
plt.close(fig)
|
||||
|
||||
with tempfile.TemporaryDirectory() as checkpoint_dir:
|
||||
checkpoint_data = {
|
||||
"epoch": epoch,
|
||||
"net_state_dict": model.state_dict(),
|
||||
"optimizer_state_dict": optimizer.state_dict(),
|
||||
"data_preprocess":data_preprocess
|
||||
|
||||
}
|
||||
data_path = pathlib.Path(checkpoint_dir) / "data.pkl"
|
||||
with open(data_path, "wb") as fp:
|
||||
pickle.dump(checkpoint_data, fp)
|
||||
|
||||
save_inference_files(model,data_preprocess,checkpoint_dir)
|
||||
|
||||
checkpoint = ray.train.Checkpoint.from_directory(checkpoint_dir)
|
||||
ray.train.report(
|
||||
reports,
|
||||
checkpoint=checkpoint,
|
||||
)
|
||||
else:
|
||||
ray.train.report(reports,)
|
||||
t_epoch_end=time.time()
|
||||
logger.info(f"Save Checkpoint costs {t_epoch_end-t_epoch_val_end}s")
|
||||
|
||||
print("Training Epoch:[{}/{}], Average Loss:{}, Cost {}s".format(epoch+1,hydra_cfg.train.max_epoch,train_loss,t_epoch_end-t_epoch_start))
|
||||
for i,name in enumerate(hydra_cfg.labels_name):
|
||||
print(f"train_hit_rate_{name}: {train_hit_rate[i]} ",end=" ")
|
||||
print(f"val_hit_rate_{name}: {val_hit_rate[i]} ",end=" ")
|
||||
# for i,name in enumerate(hydra_cfg.labels_name):
|
||||
# print(f"train_hit_rate_{name}: {train_hit_rate[i]} ",end=" ")
|
||||
# print(f"val_hit_rate_{name}: {val_hit_rate[i]} ",end=" ")
|
||||
|
||||
#####################################################
|
||||
## Save a checkpoint each epoch and log it to TensorBoard ##
|
||||
#####################################################
|
||||
|
||||
# Test
|
||||
|
||||
test_dataset=load_dataset(data_preprocess.dataset_save_dir,dataset_type="test")
|
||||
|
||||
testloader = torch.utils.data.DataLoader(
|
||||
test_dataset, batch_size=int(tune_cfg["batch_size"]), shuffle=True, num_workers=hydra_cfg.dataset.num_worker,drop_last=False
|
||||
)
|
||||
|
||||
|
||||
|
||||
|
||||
test_loss = 0.0
|
||||
test_steps = 0
|
||||
test_errors=None
|
||||
test_outputs=None
|
||||
test_labels=None
|
||||
test_hit_rate=None
|
||||
logger.info(f"Test starts")
|
||||
t_epoch_test_start=time.time()
|
||||
|
||||
model.eval()
|
||||
|
||||
for batch, (features, labels) in enumerate(testloader):
|
||||
t_iteration_start=time.time()
|
||||
with torch.no_grad():
|
||||
features=features.to(device)
|
||||
labels=labels.to(device)
|
||||
|
||||
outputs = model(features)
|
||||
loss = criterion(outputs, labels)
|
||||
|
||||
labels_npy=labels.cpu().detach().numpy()
|
||||
outputs_npy=outputs.cpu().detach().numpy()
|
||||
|
||||
outputs_npy, labels_npy,error,hit_rate,bounds=data_preprocess.get_metric(outputs_npy, labels_npy)
|
||||
|
||||
test_loss += loss.cpu().numpy()
|
||||
test_steps += 1
|
||||
if test_steps==1:
|
||||
test_hit_rate=hit_rate
|
||||
test_errors=error
|
||||
test_outputs=outputs_npy
|
||||
test_labels=labels_npy
|
||||
else:
|
||||
test_hit_rate=(test_steps-1)/test_steps *test_hit_rate+ 1/test_steps*hit_rate
|
||||
test_errors=np.concatenate((test_errors,error),axis=0)
|
||||
test_outputs=np.concatenate((test_outputs,outputs_npy),axis=0)
|
||||
test_labels=np.concatenate((test_labels,labels_npy),axis=0)
|
||||
t_iteration_end=time.time()
|
||||
print("Test Iteration:{}/{},Loss:{}, Cost {}s".format(test_steps,len(testloader),loss.item(),t_iteration_end-t_iteration_start))
|
||||
|
||||
t_epoch_test_end=time.time()
|
||||
logger.info(f"Test costs {t_epoch_test_end-t_epoch_test_start}s")
|
||||
tmp_string=""
|
||||
for i,name in enumerate(data_preprocess.labelNames):
|
||||
tmp_string+=f"{name}\t:{test_hit_rate[i]*100:.2f}%\n"
|
||||
writer.add_text(f"test/hit_rate",tmp_string)
|
||||
|
48
src/optimizer/utils.py
Normal file
48
src/optimizer/utils.py
Normal file
@ -0,0 +1,48 @@
|
||||
import shutil
|
||||
import pathlib
|
||||
import pickle
|
||||
import torch
|
||||
def save_inference_files(model,data_preprocess,checkpoint_dir):
|
||||
sava_dir=pathlib.Path(checkpoint_dir)/"inference"
|
||||
print("开始在checkpoint中保存推理所需文件")
|
||||
print("choose_frame_spatial文件路径:",data_preprocess.choose_frame_spatial.file_path.parent)
|
||||
|
||||
# Save the files related to choose_frame_spatial
|
||||
|
||||
|
||||
sava_dir.mkdir(mode=0o777, parents=True, exist_ok=True)
|
||||
|
||||
shutil.copy(data_preprocess.choose_frame_spatial.file_path,sava_dir/"choose_frame_spatial.py")
|
||||
with open(sava_dir/"choose_frame_spatial.pkl",'wb') as f:
|
||||
pickle.dump(data_preprocess.choose_frame_spatial.state_dict(),f)
|
||||
|
||||
|
||||
shutil.copy(data_preprocess.features_scaling.file_path,sava_dir/"features_scaling.py")
|
||||
with open(sava_dir/"features_scaling.pkl",'wb') as f:
|
||||
pickle.dump(data_preprocess.features_scaling.state_dict(),f)
|
||||
|
||||
|
||||
shutil.copy(data_preprocess.labels_scaling.file_path,sava_dir/"labels_scaling.py")
|
||||
with open(sava_dir/"labels_scaling.pkl",'wb') as f:
|
||||
pickle.dump(data_preprocess.labels_scaling.state_dict(),f)
|
||||
|
||||
with open(sava_dir/"labelNames.pkl",'wb') as f:
|
||||
pickle.dump(data_preprocess.labelNames,f)
|
||||
|
||||
|
||||
input_tensor = torch.rand((1,*data_preprocess.features_scaling.feature_shape), dtype=torch.float32).to("cuda:0")
|
||||
torch.onnx.export(
|
||||
model, # model to export
|
||||
(input_tensor,), # inputs of the model,
|
||||
str(sava_dir/"model.onnx"), # filename of the ONNX model
|
||||
input_names=["input"], # Rename inputs for the ONNX model
|
||||
dynamo=True # True or False to select the exporter to use
|
||||
)
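To sanity-check the exported file, the ONNX model can be loaded back and run with onnxruntime. This is a hedged usage sketch; onnxruntime is an extra dependency that is not referenced elsewhere in this commit:

import numpy as np
import onnxruntime as ort

session = ort.InferenceSession(str(sava_dir / "model.onnx"), providers=["CPUExecutionProvider"])
dummy = np.random.rand(1, *data_preprocess.features_scaling.feature_shape).astype(np.float32)
outputs = session.run(None, {"input": dummy})   # "input" matches input_names above
print(outputs[0].shape)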
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
149
src/scripts/Add_Lab.py
Normal file
149
src/scripts/Add_Lab.py
Normal file
@ -0,0 +1,149 @@
|
||||
import pandas as pd
|
||||
# from tqdm import tqdm
|
||||
if __name__ =="__main__":
|
||||
|
||||
label_file_path="/data/SEMS-model-training/labels/NanEr/2024-05-15_08-03_2335.xlsx"
|
||||
raw_label_lab_file_path="/data/SEMS-model-training/labels/raw_labels/NanEr/5-8月份转炉TSC和TSO化验结果.xls"
|
||||
|
||||
save_dir="/data/SEMS-model-training/labels/NanEr/2024-05-15_08-03_2335_.xlsx"
|
||||
|
||||
labels=pd.read_excel(label_file_path)
|
||||
|
||||
print(f"原始数据量{labels.shape[0]}")
|
||||
labels=labels.loc[:,["Furnace_Number","Steel_Type","TSC_start_time","TSC_end_time","TSC_T","TSC_C","TSO_start_time","TSO_end_time","TSO_T","TSO_C"]]
|
||||
|
||||
# The following logic has been integrated into the data preprocessing. Dropping every row that contains a 0 anywhere is too strict; only rows where the specified labels are 0 should be dropped.
|
||||
# # Select rows containing NULL
|
||||
null_rows=labels.isnull().any(axis=1)
|
||||
# # Select rows containing 0
|
||||
zeros_rows=(labels==0).any(axis=1) | (labels=='0').any(axis=1)
|
||||
|
||||
# # Drop every row that contains a NULL or a 0
|
||||
selected_rows=~(null_rows|zeros_rows)
|
||||
labels=labels[selected_rows]
|
||||
|
||||
print(f"删除无效数据后{labels.shape[0]}")
|
||||
|
||||
labels_output=labels.copy()
|
||||
|
||||
labels_output['TSC_P']=0.0
|
||||
labels_output['TSC_S']=0.0
|
||||
labels_output['TSC_Mn']=0.0
|
||||
labels_output['TSC_Ni']=0.0
|
||||
labels_output['TSC_Mo']=0.0
|
||||
labels_output['TSC_Cr']=0.0
|
||||
labels_output['TSO_P']=0.0
|
||||
labels_output['TSO_S']=0.0
|
||||
labels_output['TSO_Mn']=0.0
|
||||
labels_output['TSO_Ni']=0.0
|
||||
labels_output['TSO_Mo']=0.0
|
||||
labels_output['TSO_Cr']=0.0
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
raw_label_lab=pd.read_excel(raw_label_lab_file_path)
|
||||
raw_label_lab=raw_label_lab.loc[:,["炉次号","分析时间","Mn","P","S","Ni","Cr","Mo"]]
|
||||
# print(raw_label_lab)
|
||||
|
||||
for i in range(labels.shape[0]):
|
||||
# i=4
|
||||
furnaceID=labels.iloc[i]["Furnace_Number"]
|
||||
|
||||
# print(raw_label_lab[raw_label_lab["炉次号"]==furnaceID])
|
||||
|
||||
tmp=raw_label_lab[raw_label_lab["炉次号"]==furnaceID]
|
||||
|
||||
if tmp.shape[0]==0:
|
||||
print(f"炉次号{furnaceID}找不到对应的数据")
|
||||
continue
|
||||
elif tmp.shape[0]==1:
|
||||
print(f"炉次号{furnaceID}找到1个")
|
||||
labels_output.loc[i,'TSC_P']=tmp.iloc[0]["P"]
|
||||
labels_output.loc[i,'TSC_S']=tmp.iloc[0]["S"]
|
||||
labels_output.loc[i,'TSC_Mn']=tmp.iloc[0]["Mn"]
|
||||
labels_output.loc[i,'TSC_Ni']=tmp.iloc[0]["Ni"]
|
||||
labels_output.loc[i,'TSC_Mo']=tmp.iloc[0]["Mo"]
|
||||
labels_output.loc[i,'TSC_Cr']=tmp.iloc[0]["Cr"]
|
||||
labels_output.loc[i,'TSO_P']=tmp.iloc[0]["P"]
|
||||
labels_output.loc[i,'TSO_S']=tmp.iloc[0]["S"]
|
||||
labels_output.loc[i,'TSO_Mn']=tmp.iloc[0]["Mn"]
|
||||
labels_output.loc[i,'TSO_Ni']=tmp.iloc[0]["Ni"]
|
||||
labels_output.loc[i,'TSO_Mo']=tmp.iloc[0]["Mo"]
|
||||
labels_output.loc[i,'TSO_Cr']=tmp.iloc[0]["Cr"]
|
||||
else:
|
||||
print(f"炉次号{furnaceID}找到{tmp.shape[0]}个")
|
||||
|
||||
# Find the lab analysis closest in time to the TSC measurement
|
||||
min_time=None
|
||||
min_index=0
|
||||
for j in range(tmp.shape[0]):
|
||||
# print(j,i)
|
||||
delta_time=tmp.iloc[j]["分析时间"]-labels.iloc[i]["TSC_end_time"]
|
||||
if min_time is None:
|
||||
min_time=delta_time
|
||||
else:
|
||||
if delta_time<min_time:
|
||||
min_time=delta_time
|
||||
min_index=j
|
||||
# print(min_time,min_index)
|
||||
|
||||
labels_output.loc[i,'TSC_P']=tmp.iloc[min_index]["P"]
|
||||
labels_output.loc[i,'TSC_S']=tmp.iloc[min_index]["S"]
|
||||
labels_output.loc[i,'TSC_Mn']=tmp.iloc[min_index]["Mn"]
|
||||
labels_output.loc[i,'TSC_Ni']=tmp.iloc[min_index]["Ni"]
|
||||
labels_output.loc[i,'TSC_Mo']=tmp.iloc[min_index]["Mo"]
|
||||
labels_output.loc[i,'TSC_Cr']=tmp.iloc[min_index]["Cr"]
|
||||
# labels.iloc[i]['TSO_P']=tmp.iloc[i]["P"]
|
||||
# labels.iloc[i]['TSO_S']=tmp.iloc[i]["S"]
|
||||
# labels.iloc[i]['TSO_Mn']=tmp.iloc[i]["Mn"]
|
||||
# labels.iloc[i]['TSO_Ni']=tmp.iloc[i]["Ni"]
|
||||
# labels.iloc[i]['TSO_Mo']=tmp.iloc[i]["Mo"]
|
||||
# labels.iloc[i]['TSO_Cr']=tmp.iloc[i]["Cr"]
|
||||
|
||||
#寻找离TSO最近的
|
||||
min_time=None
|
||||
min_index=0
|
||||
for j in range(tmp.shape[0]):
|
||||
# print(j,i)
|
||||
# print(tmp.iloc[j]["分析时间"],labels.iloc[i]["TSO_end_time"])
|
||||
delta_time=tmp.iloc[j]["分析时间"]-labels.iloc[i]["TSO_end_time"]
|
||||
if min_time is None:
|
||||
min_time=delta_time
|
||||
else:
|
||||
if delta_time<min_time:
|
||||
min_time=delta_time
|
||||
min_index=j
|
||||
# print(min_time,min_index)
|
||||
|
||||
labels_output.loc[i,'TSO_P']=tmp.iloc[min_index]["P"]
|
||||
labels_output.loc[i,'TSO_S']=tmp.iloc[min_index]["S"]
|
||||
labels_output.loc[i,'TSO_Mn']=tmp.iloc[min_index]["Mn"]
|
||||
labels_output.loc[i,'TSO_Ni']=tmp.iloc[min_index]["Ni"]
|
||||
labels_output.loc[i,'TSO_Mo']=tmp.iloc[min_index]["Mo"]
|
||||
labels_output.loc[i,'TSO_Cr']=tmp.iloc[min_index]["Cr"]
|
||||
|
||||
null_rows=labels_output.isnull().any(axis=1)
|
||||
# # 选出有0的行
|
||||
zeros_rows=(labels_output==0).any(axis=1) | (labels_output=='0').any(axis=1)
|
||||
|
||||
# # 每一行但凡有NULL或者0都给删了
|
||||
selected_rows=~(null_rows|zeros_rows)
|
||||
labels_output=labels_output[selected_rows]
|
||||
print(labels_output)
|
||||
print(f"插入实验室数据的后的大小{labels_output.shape[0]}")
|
||||
|
||||
labels_output.to_excel(save_dir,index=False)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# for i in range(labels.shape[0]):
|
||||
# print(f"[{i+1}/{labels.shape[0]}],炉次号:{labels.iloc[i]["Furnace_Number"]}")
|
||||
# furnaceID=labels.iloc[i]["Furnace_Number"]
|
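The closest-in-time matching above can also be expressed with pandas' merge_asof, which removes the per-row loops. The sketch below is illustrative only, not code from this repository: attach_nearest_lab_result is a hypothetical helper, and it assumes the same column names as the script and that 分析时间 and TSC_end_time are already parsed as datetimes.

import pandas as pd

def attach_nearest_lab_result(labels: pd.DataFrame, lab: pd.DataFrame) -> pd.DataFrame:
    # merge_asof requires both frames to be sorted on their time keys
    labels_sorted = labels.sort_values("TSC_end_time")
    lab_sorted = lab.sort_values("分析时间")
    # For each furnace, attach the lab row with the same 炉次号 whose 分析时间
    # is closest (by absolute difference) to TSC_end_time
    merged = pd.merge_asof(
        labels_sorted,
        lab_sorted,
        left_on="TSC_end_time",
        right_on="分析时间",
        left_by="Furnace_Number",
        right_by="炉次号",
        direction="nearest",
    )
    return merged

With direction="nearest" the match is the smallest absolute time difference; direction="forward" would instead reproduce "first analysis after the measurement" if that is the intended rule.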
@ -16,9 +16,10 @@ import os

if __name__ =="__main__":

-    raw_label_file_path="/home/admin/20240806-NanEr-5-8-data/labels/raw_labels/NanEr/5-8月份6#炉生产数据.xls"
+    raw_label_file_path="/data/SEMS-model-training/labels/raw_labels/NanEr/5-8月份6#炉生产数据.xls"
+    # raw_label_lab_file_path="/data/SEMS-model-training/labels/raw_labels/NanEr/5-8月份转炉TSC和TSO化验结果.xls"

-    save_dir="/home/admin/20240806-NanEr-5-8-data/labels/NanEr"
+    save_dir="/data/SEMS-model-training/labels/NanEr"

    select_and_rename={
        "炉号":"Furnace_Number",
60
src/scripts/pre_dataset_visual.py
Normal file
@ -0,0 +1,60 @@
import multiprocessing.pool
import pathlib
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA

from sklearn.manifold import TSNE
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.cluster import OPTICS

import multiprocessing


def plot_npz(file_path:pathlib.Path,output_folder:pathlib.Path)->None:

    output_folder.mkdir(mode=0o777,parents=True,exist_ok=True)

    data=np.load(file_path,allow_pickle=True)
    # Here data["rawSpectralData"] has shape (time, spectral band, spatial point)
    rawSpectralData=data["rawSpectralData"]

    plt.figure()
    spectrum_band=np.linspace(400,1000,224)
    # plt.ylim([0,4095])
    plt.plot(spectrum_band,rawSpectralData)
    # plt.title("Max Point Spectrum")
    plt.savefig(output_folder/f"{file_path.stem}.png",dpi=100)
    plt.close()
    # Plot the spectrum of the strongest point


def main(raw_data_folder:pathlib.Path,output_folder:pathlib.Path)->None:

    files=list(raw_data_folder.glob("*.npz"))

    # plot_npz(files[0],output_folder)

    p=multiprocessing.Pool(4)

    for i in range(len(files)):
        p.apply_async(plot_npz, args=(files[i],output_folder))

    print('Waiting for all subprocesses done...')
    p.close()
    p.join()
    print('All subprocesses done.')


if __name__=="__main__":
    raw_dataset_folder=pathlib.Path("/data/SEMS-model-training/dataset/DBSCAN_eps_0.15_min_samples_10/pre_dataset")
    output_folder=pathlib.Path("/data/SEMS-model-training/dataset/DBSCAN_eps_0.15_min_samples_10/pre_dataset_visual")
    main(raw_dataset_folder/"test",output_folder/"test",)
    main(raw_dataset_folder/"training",output_folder/"training",)
    main(raw_dataset_folder/"validation",output_folder/"validation",)
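One caveat shared by both visualization scripts: Pool.apply_async drops worker exceptions unless the returned AsyncResult objects are collected and checked, so a failing plot_npz call fails silently. Below is a hedged sketch of the same main() loop with error reporting, assuming plot_npz keeps its current signature; it is not the committed code.

def main(raw_data_folder: pathlib.Path, output_folder: pathlib.Path) -> None:
    files = list(raw_data_folder.glob("*.npz"))
    p = multiprocessing.Pool(4)
    # Keep the AsyncResult for every submitted task
    results = [(f, p.apply_async(plot_npz, args=(f, output_folder))) for f in files]
    p.close()
    p.join()
    # .get() re-raises any exception raised inside a worker instead of discarding it
    for f, r in results:
        try:
            r.get()
        except Exception as e:
            print(f"plot failed for {f.name}: {e}")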
265
src/scripts/raw_dataset_visual.py
Normal file
@ -0,0 +1,265 @@
import multiprocessing.pool
import pathlib
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA

from sklearn.manifold import TSNE
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.cluster import OPTICS

import multiprocessing


def plot_npz(file_path:pathlib.Path,output_folder:pathlib.Path)->None:

    output_folder=output_folder/file_path.stem
    output_folder.mkdir(mode=0o777,parents=True,exist_ok=True)

    data=np.load(file_path,allow_pickle=True)
    # Here data["rawSpectralData"] has shape (time, spectral band, spatial point)
    rawSpectralData=data["rawSpectralData"].transpose(0, 2, 1)
    rawSpectralData=rawSpectralData.reshape((rawSpectralData.shape[0]*rawSpectralData.shape[1],rawSpectralData.shape[2]))
    # rawSpectralData now has shape (time * spatial point, spectral band)

    tmp=np.max(rawSpectralData,axis=1)
    rawSpectralData=rawSpectralData[(tmp>500) & (tmp<4095),:]
    # Keep only the spatial points that are neither under-exposed nor over-exposed

    rawSpectralData_normed=(rawSpectralData-np.min(rawSpectralData,axis=1,keepdims=True))/(np.max(rawSpectralData,axis=1,keepdims=True)-np.min(rawSpectralData,axis=1,keepdims=True))
    # Normalize every spectrum to remove the intensity information

    plt.figure()
    spectrum_band=np.linspace(400,1000,224)
    index=np.argmax(rawSpectralData)
    index=np.unravel_index(index, rawSpectralData.shape)
    plt.ylim([0,4095])
    plt.plot(spectrum_band,rawSpectralData[index[0],:])
    plt.title("Max Point Spectrum")
    plt.savefig(output_folder/"max_point_spectrum.png",dpi=100)
    plt.close()
    # Plot the spectrum of the strongest point

    ################################
    # PCA
    ################################
    # pca_norm_1D = PCA(n_components=1)
    # norm_feat_1D = pca_norm_1D.fit(rawSpectralData_normed).transform(rawSpectralData_normed)
    # pca_norm_2D = PCA(n_components=2)
    # norm_feat_2D = pca_norm_2D.fit(rawSpectralData_normed).transform(rawSpectralData_normed)
    # pca_norm_3D = PCA(n_components=3)
    # norm_feat_3D = pca_norm_3D.fit(rawSpectralData_normed).transform(rawSpectralData_normed)

    # pca_1D = PCA(n_components=1)
    # feat_1D = pca_1D.fit(rawSpectralData).transform(rawSpectralData)
    # pca_2D = PCA(n_components=2)
    # feat_2D = pca_2D.fit(rawSpectralData).transform(rawSpectralData)
    # pca_3D = PCA(n_components=3)
    # feat_3D = pca_3D.fit(rawSpectralData).transform(rawSpectralData)

    # fig, axs = plt.subplots(2, 3,figsize=(15,10))

    # axs[0,0].scatter(norm_feat_1D, np.zeros((norm_feat_1D.shape[0],)), s=0.1)
    # axs[0, 0].set_title(' norm')
    # axs[0,1].scatter(norm_feat_2D[:,0], norm_feat_2D[:,1], s=0.1)
    # ax_3d = fig.add_subplot(2, 3, 3, projection='3d')
    # ax_3d.scatter(norm_feat_3D[:, 0], norm_feat_3D[:, 1], norm_feat_3D[:, 2], s=0.01)

    # axs[1,0].scatter(feat_1D, np.zeros((feat_1D.shape[0],)), s=0.1)
    # axs[1, 0].set_title(' raw')
    # axs[1,1].scatter(feat_2D[:,0], feat_2D[:,1], s=0.1)
    # ax_3d = fig.add_subplot(2, 3, 6, projection='3d')
    # ax_3d.scatter(feat_3D[:, 0], feat_3D[:, 1], feat_3D[:, 2], s=0.01)
    # plt.savefig(output_folder/"PCA.png",dpi=100)

    # plt.close()

    ################################
    # KernelPCA gamma = 15
    ################################

    def plot_KernelPCA(gamma):
        pca_norm_1D = KernelPCA(n_components=1, kernel='rbf', gamma=gamma)
        norm_feat_1D = pca_norm_1D.fit(rawSpectralData_normed).transform(rawSpectralData_normed)
        pca_norm_2D = KernelPCA(n_components=2, kernel='rbf', gamma=gamma)
        norm_feat_2D = pca_norm_2D.fit(rawSpectralData_normed).transform(rawSpectralData_normed)
        pca_norm_3D = KernelPCA(n_components=3, kernel='rbf', gamma=gamma)
        norm_feat_3D = pca_norm_3D.fit(rawSpectralData_normed).transform(rawSpectralData_normed)

        pca_1D = KernelPCA(n_components=1, kernel='rbf', gamma=gamma)
        feat_1D = pca_1D.fit(rawSpectralData).transform(rawSpectralData)
        pca_2D = KernelPCA(n_components=2, kernel='rbf', gamma=gamma)
        feat_2D = pca_2D.fit(rawSpectralData).transform(rawSpectralData)
        pca_3D = KernelPCA(n_components=3, kernel='rbf', gamma=gamma)
        feat_3D = pca_3D.fit(rawSpectralData).transform(rawSpectralData)

        fig, axs = plt.subplots(2, 3,figsize=(15,10))

        axs[0,0].scatter(norm_feat_1D, np.zeros((norm_feat_1D.shape[0],)), s=0.1)
        axs[0, 0].set_title(' norm')
        axs[0,1].scatter(norm_feat_2D[:,0], norm_feat_2D[:,1], s=0.1)
        ax_3d = fig.add_subplot(2, 3, 3, projection='3d')
        ax_3d.scatter(norm_feat_3D[:, 0], norm_feat_3D[:, 1], norm_feat_3D[:, 2], s=0.01)

        axs[1,0].scatter(feat_1D, np.zeros((feat_1D.shape[0],)), s=0.1)
        axs[1, 0].set_title(' raw')
        axs[1,1].scatter(feat_2D[:,0], feat_2D[:,1], s=0.1)
        ax_3d = fig.add_subplot(2, 3, 6, projection='3d')
        ax_3d.scatter(feat_3D[:, 0], feat_3D[:, 1], feat_3D[:, 2], s=0.01)
        plt.savefig(output_folder/f"KernelPCA_gamma_{gamma}.png",dpi=100)

        plt.close()

    # plot_KernelPCA(gamma=None)
    # plot_KernelPCA(gamma=1)
    # plot_KernelPCA(gamma=5)
    # plot_KernelPCA(gamma=10)
    # plot_KernelPCA(gamma=15)

    ################################
    # t-SNE
    ################################
    # pca_norm_1D = TSNE(n_components=1, learning_rate='auto', init='pca')
    # norm_feat_1D = pca_norm_1D.fit(rawSpectralData_normed).fit_transform(rawSpectralData_normed)
    # pca_norm_2D = TSNE(n_components=2, learning_rate='auto', init='pca')
    # norm_feat_2D = pca_norm_2D.fit(rawSpectralData_normed).fit_transform(rawSpectralData_normed)
    # pca_norm_3D = TSNE(n_components=3, learning_rate='auto', init='pca')
    # norm_feat_3D = pca_norm_3D.fit(rawSpectralData_normed).fit_transform(rawSpectralData_normed)

    # pca_1D = TSNE(n_components=1, learning_rate='auto', init='pca')
    # feat_1D = pca_1D.fit(rawSpectralData).fit_transform(rawSpectralData)
    # pca_2D = TSNE(n_components=2, learning_rate='auto', init='pca')
    # feat_2D = pca_2D.fit(rawSpectralData).fit_transform(rawSpectralData)
    # pca_3D = TSNE(n_components=3, learning_rate='auto', init='pca')
    # feat_3D = pca_3D.fit(rawSpectralData).fit_transform(rawSpectralData)

    # fig, axs = plt.subplots(2, 3,figsize=(15,10))

    # axs[0,0].scatter(norm_feat_1D, np.zeros((norm_feat_1D.shape[0],)), s=0.1)
    # axs[0, 0].set_title(' norm')
    # axs[0,1].scatter(norm_feat_2D[:,0], norm_feat_2D[:,1], s=0.1)
    # ax_3d = fig.add_subplot(2, 3, 3, projection='3d')
    # ax_3d.scatter(norm_feat_3D[:, 0], norm_feat_3D[:, 1], norm_feat_3D[:, 2], s=0.01)

    # axs[1,0].scatter(feat_1D, np.zeros((feat_1D.shape[0],)), s=0.1)
    # axs[1, 0].set_title(' raw')
    # axs[1,1].scatter(feat_2D[:,0], feat_2D[:,1], s=0.1)
    # ax_3d = fig.add_subplot(2, 3, 6, projection='3d')
    # ax_3d.scatter(feat_3D[:, 0], feat_3D[:, 1], feat_3D[:, 2], s=0.01)
    # plt.savefig(output_folder/"t-SNE.png",dpi=100)

    # plt.close()

    def plot_DBSCAN(eps=0.15, min_samples=10):

        db_norm = DBSCAN(eps=eps, min_samples=min_samples).fit(rawSpectralData_normed)

        labels_norm = db_norm.labels_
        n_norm = len(set(labels_norm)) - (1 if -1 in labels_norm else 0)
        n_noise_norm = list(labels_norm).count(-1)

        # Find the largest cluster among the normalized spectra
        max_i=0
        max_num=0
        for i in range(n_norm):
            tmp=(labels_norm==i).sum()
            if tmp>max_num:
                max_i=i
                max_num=tmp

        fig, axs = plt.subplots(2, 2,figsize=(10,10))
        for i in range(labels_norm.shape[0]):
            if labels_norm[i]==max_i:
                axs[0,0].plot(rawSpectralData_normed[i])
        axs[0,0].set_title(f'norm data w norm cluster {max_num},{n_norm},{n_noise_norm}')

        for i in range(labels_norm.shape[0]):
            if labels_norm[i]==max_i:
                axs[0,1].plot(rawSpectralData[i])
        axs[0,1].set_title('raw data w norm cluster')

        db_raw = DBSCAN(eps=eps, min_samples=min_samples).fit(rawSpectralData)
        labels_raw = db_raw.labels_
        n_raw = len(set(labels_raw)) - (1 if -1 in labels_raw else 0)
        n_noise_raw = list(labels_raw).count(-1)

        # Find the largest cluster among the raw spectra
        max_i=0
        max_num=0
        for i in range(n_raw):
            tmp=(labels_raw==i).sum()
            if tmp>max_num:
                max_i=i
                max_num=tmp

        for i in range(labels_raw.shape[0]):
            if labels_raw[i]==max_i:
                axs[1,0].plot(rawSpectralData_normed[i])
        axs[1,0].set_title(f'norm data w raw cluster {max_num},{n_raw},{n_noise_raw}')

        for i in range(labels_raw.shape[0]):
            if labels_raw[i]==max_i:
                axs[1,1].plot(rawSpectralData[i])
        axs[1,1].set_title('raw data w raw cluster')

        plt.savefig(output_folder/f"DBSCAN_eps_{eps}_min_samples_{min_samples}.png",dpi=100)

        plt.close()

    plot_DBSCAN(eps=0.15, min_samples=10)
    # plot_DBSCAN(eps=100, min_samples=10)

    # print(file_path.stem,rawSpectralData.shape)


def main(raw_data_folder:pathlib.Path,output_folder:pathlib.Path)->None:

    files=list(raw_data_folder.glob("*.npz"))

    # plot_npz(files[0],output_folder)

    p=multiprocessing.Pool(4)

    for i in range(len(files)):
        p.apply_async(plot_npz, args=(files[i],output_folder))

    print('Waiting for all subprocesses done...')
    p.close()
    p.join()
    print('All subprocesses done.')


if __name__=="__main__":
    raw_dataset_folder=pathlib.Path("/data/SEMS-model-training/dataset/raw_dataset")
    output_folder=pathlib.Path("/data/SEMS-model-training/dataset/raw_dataset_visual")
    main(raw_dataset_folder/"test",output_folder/"test",)
    main(raw_dataset_folder/"training",output_folder/"training",)
    main(raw_dataset_folder/"validation",output_folder/"validation",)
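The plot_DBSCAN panels above are previewing a selection rule: cluster the normalized spectra with DBSCAN and keep only the dominant cluster, discarding noise points and minority clusters. Below is a stand-alone sketch of that rule; keep_dominant_cluster and its default thresholds are illustrative, not code from this repository.

import numpy as np
from sklearn.cluster import DBSCAN

def keep_dominant_cluster(spectra: np.ndarray, eps: float = 0.15, min_samples: int = 10) -> np.ndarray:
    """spectra: (n_points, n_bands) raw spectra; returns the rows in the largest DBSCAN cluster."""
    # Normalize each spectrum to [0, 1] so clustering ignores absolute intensity
    mn = spectra.min(axis=1, keepdims=True)
    mx = spectra.max(axis=1, keepdims=True)
    normed = (spectra - mn) / (mx - mn)
    labels = DBSCAN(eps=eps, min_samples=min_samples).fit(normed).labels_
    cluster_ids = [c for c in set(labels) if c != -1]   # -1 marks noise
    if not cluster_ids:
        return spectra                                   # nothing clustered; keep everything
    biggest = max(cluster_ids, key=lambda c: (labels == c).sum())
    return spectra[labels == biggest]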
276
src/scripts/rawbindata2rawdataset.py
Normal file
@ -0,0 +1,276 @@
'''
@File    :   pre_process.py
@Time    :   2024/08/09 13:54:28
@Author  :   Zhanpeng Yang
@Version :   0.0.1
@Contact :   zhpyang@outlook.com
@Desc    :   Data preprocessing
'''

import pandas as pd
import logging
import os,glob
import numpy as np
import datetime
import pickle
import h5py

class DataPreProcess:
    """
    Holds all data preprocessing steps, including
    1. Extracting the spectra around the target timestamps from the raw binary data, based on the Excel file provided by the steel plant
    2. Selecting suitable, stable time and spatial points from those spectra as features
    3. Scaling the features and labels
    """
    def __init__(self, raw_spectral_data_dir, raw_labels_dir,labels_name,dataset_dir,train_ratio=0.8,validate_ratio=0.1) -> None:
        """Initialize the class with all parameters needed for preprocessing.

        Args:
            raw_spectral_data_dir: directory of the raw spectral data
            raw_labels_dir: path of the raw labels
            labels_name: names of the labels to extract
            dataset_dir: directory of the generated dataset
            train_ratio (float, optional): training-set ratio. Defaults to 0.8.
            validate_ratio (float, optional): validation-set ratio; the remainder becomes the test set. Defaults to 0.1.
        """
        self.raw_spectral_data_dir=raw_spectral_data_dir
        self.raw_labels_dir=raw_labels_dir
        self.dataset_dir=dataset_dir
        self.labels_name=labels_name

        self.train_ratio=train_ratio
        self.validate_ratio=validate_ratio

        # Load the raw labels
        self.raw_labels=pd.read_excel(raw_labels_dir)

        # Load (or build) the cache of the raw spectral data files
        self.raw_spectral_data_cache=self._load_raw_spectral_data_cache(raw_spectral_data_dir)

        # Load one csv file to determine the spectral and spatial dimensions
        self.spectral_dim,self.spatial_dim= self._get_spectral_spatial_dim(self.raw_spectral_data_cache)

        # Start converting the raw data into the dataset
        self._raw_data_2_dataset()


    # def _load_raw_labels(self,raw_labels_dir:str,labels_name:list)->pd.DataFrame:
    #     """Scan the folder of script-converted plant Excel files,
    #     select the specified label columns as outputs,
    #     and drop rows whose values are NaN or 0.
    #
    #     Args:
    #         raw_labels_dir (str): path of the converted plant Excel files
    #         labels_name (list): columns to use as labels
    #
    #     Returns:
    #         pd.DataFrame: all furnaces that pass the filtering
    #     """
    #     raw_labels=None
    #     for name in os.listdir(raw_labels_dir):
    #         tmp_raw_labels=pd.read_excel(os.path.join(raw_labels_dir,name))
    #
    #         choosed_column=["TSC_start_time","TSC_end_time"]
    #         choosed_column=choosed_column+labels_name
    #
    #         # Keep only the columns we want as labels
    #         tmp_raw_labels=tmp_raw_labels.loc[:,choosed_column]
    #
    #         # Select rows that contain NULL
    #         null_rows=tmp_raw_labels.isnull().any(axis=1)
    #         # Select rows that contain 0
    #         zeros_rows=(tmp_raw_labels==0).any(axis=1) | (tmp_raw_labels=='0').any(axis=1)
    #
    #         # Drop every row that contains a NULL or a 0
    #         selected_rows=~(null_rows|zeros_rows)
    #         tmp_raw_labels=tmp_raw_labels[selected_rows]
    #
    #         if raw_labels is None:
    #             raw_labels=tmp_raw_labels
    #         else:
    #             raw_labels=pd.concat([raw_labels,tmp_raw_labels],axis=0)
    #         logging.debug(f"Reading raw label excel file:{name}, which has {tmp_raw_labels.shape[0]} furnaces")
    #     logging.debug(f"Read raw label excel files, which have {raw_labels.shape[0]} furnaces in total")
    #
    #     return raw_labels

    def _load_raw_spectral_data_cache(self,raw_spectral_data_dir:str)->list:
        """Build a cache of all raw spectral data files, recording each file's start and end time,
        so that later reads of the raw data are faster.

        Args:
            raw_spectral_data_dir (str): directory of the raw spectra

        Returns:
            list: one entry per raw data file, each holding the start/end time (as datetime) and the file path
        """
        spectral_file_paths=glob.glob(os.path.join(raw_spectral_data_dir,"*.csv"))
        cache_file_paths=glob.glob(os.path.join(raw_spectral_data_dir,"*.pkl"))

        update_flag=False

        # Regenerate the cache if it does not exist, or if the number of cached entries
        # no longer matches the number of data files in the folder
        if len(cache_file_paths)==0:
            logging.debug(f"Raw spectral data cache does not exist! Generating")
            update_flag=True
        elif len(cache_file_paths)==1:
            with open(cache_file_paths[0],"rb") as f:
                raw_spectral_data_cache=pickle.load(f)
            if len(raw_spectral_data_cache) !=len(spectral_file_paths):
                logging.debug(f"Raw spectral data cache is out of date! Regenerating, cache file number:{len(raw_spectral_data_cache)}, spectral data file number: {len(spectral_file_paths)}")
                update_flag=True
        else:
            logging.error(f"More than one 'raw_spectral_data_cache.pkl' file exists in {raw_spectral_data_dir}")

        if update_flag:
            spectral_file_paths.sort()
            raw_spectral_data_cache=[]
            for file in spectral_file_paths:
                tmp_info={}
                tmp_data=np.loadtxt(file, delimiter=",")
                start_t=datetime.datetime.fromtimestamp(tmp_data[0,0]/1000)+datetime.timedelta(microseconds=tmp_data[0,0]%1000)
                end_t=datetime.datetime.fromtimestamp(tmp_data[-1,0]/1000)+datetime.timedelta(microseconds=tmp_data[-1,0]%1000)
                tmp_info["start_t"]=start_t
                tmp_info["end_t"]=end_t
                tmp_info["file_path"]=file
                raw_spectral_data_cache.append(tmp_info)

            with open(os.path.join(raw_spectral_data_dir,f"raw_spectral_data_cache.pkl"),"wb") as f:
                pickle.dump(raw_spectral_data_cache,f)

        return raw_spectral_data_cache

    def _get_spectral_spatial_dim(self,raw_spectral_data_cache):
        data=np.loadtxt(raw_spectral_data_cache[0]["file_path"], delimiter=",").astype(np.uint64)
        if data[0,2]==229376:
            spectral_dim =224
            spatial_dim=512
        if data[0,2]==917504:
            spectral_dim =448
            spatial_dim=1024
        return spectral_dim, spatial_dim

    def _read_spectral_data(self,start_time:datetime.datetime,end_time:datetime.datetime)->np.ndarray:
        """Read the spectral data between start_time and end_time.

        Args:
            start_time (datetime.datetime): start time
            end_time (datetime.datetime): end time

        Returns:
            np.ndarray: raw spectral data
        """

        def get_spectral_data_per_file(file_path,s_t,e_t):
            data=np.loadtxt(file_path, delimiter=",").astype(np.uint64)

            if s_t is not None:
                tmp_s=datetime.datetime.timestamp(s_t)*1000
                tmp_s_index=0
                for i in range(data.shape[0]-1):
                    if data[i,0]<=tmp_s and data[i+1,0]>=tmp_s:
                        tmp_s_index=i
                        break
            else:
                tmp_s_index=0
            if e_t is not None:
                tmp_e=datetime.datetime.timestamp(e_t)*1000
                tmp_e_index=data.shape[0]
                for i in range(tmp_s_index,data.shape[0]-1):
                    if data[i,0]<=tmp_e and data[i+1,0]>=tmp_e:
                        tmp_e_index=i
                        break
            else:
                tmp_e_index=data.shape[0]

            with open(file_path[:-3]+"bin", "rb") as f:
                f.seek(data[tmp_s_index,1])
                d=f.read(np.uint64((tmp_e_index-tmp_s_index)*data[tmp_s_index,2]))
                d=np.frombuffer(d, dtype=np.uint16).reshape(tmp_e_index-tmp_s_index,self.spectral_dim,self.spatial_dim)
            return data[tmp_s_index:tmp_e_index,0],d

        timestamps=None
        raw_spectral_data=None
        for tmp_info in self.raw_spectral_data_cache:
            tmp_data=None
            if start_time<tmp_info["start_t"] and end_time>tmp_info["start_t"] and end_time<tmp_info["end_t"]:
                # The target interval overlaps the left end of this file's interval,
                # so read from the start of the file up to end_time
                tmp_time_stamp,tmp_data=get_spectral_data_per_file(tmp_info["file_path"],None,end_time)
            elif start_time<tmp_info["start_t"] and end_time>tmp_info["end_t"]:
                # The target interval fully contains this file's interval, so read the whole file
                tmp_time_stamp,tmp_data=get_spectral_data_per_file(tmp_info["file_path"],None,None)
            elif start_time>tmp_info["start_t"] and end_time<tmp_info["end_t"]:
                # The target interval is fully contained in this file's interval,
                # so read from start_time to end_time
                tmp_time_stamp,tmp_data=get_spectral_data_per_file(tmp_info["file_path"],start_time,end_time)
            elif start_time>tmp_info["start_t"] and start_time<tmp_info["end_t"] and end_time>tmp_info["end_t"]:
                # The target interval overlaps the right end of this file's interval,
                # so read from start_time up to the end of the file
                tmp_time_stamp,tmp_data=get_spectral_data_per_file(tmp_info["file_path"],start_time,None)
            if tmp_data is not None:
                if raw_spectral_data is None:
                    timestamps=tmp_time_stamp
                    raw_spectral_data=tmp_data
                else:
                    timestamps=np.concatenate((timestamps,tmp_time_stamp),axis=0)
                    raw_spectral_data=np.concatenate((raw_spectral_data,tmp_data),axis=0)
        return timestamps,raw_spectral_data

    def _raw_data_2_dataset(self):

        save_dir=os.path.join(self.dataset_dir)

        # Step 1: select the feature time and spatial points, and save the result
        pre_dataset_save_dir=os.path.join(save_dir,"raw_dataset")

        if not os.path.exists(pre_dataset_save_dir):
            os.makedirs(pre_dataset_save_dir)

        # The data are regenerated when the directory does not exist; otherwise this part is skipped.
        # NOTE: when new data arrive, delete the folders under dataset_dir (i.e. clear this cache)
        # so that everything is regenerated.
        for i in range(self.raw_labels.shape[0]):

            start_time,end_time=self.raw_labels.iloc[i]["TSC_start_time"]+datetime.timedelta(seconds=-10),self.raw_labels.iloc[i]["TSC_end_time"]
            timestamps,raw_spectral_data=self._read_spectral_data(start_time,end_time)

            logging.debug(f"{self.raw_labels.iloc[i]['TSC_start_time']+datetime.timedelta(seconds=-10)},{self.raw_labels.iloc[i]['TSC_end_time']}")
            if raw_spectral_data is not None:
                # Only record the furnace if the acquired frame rate is above 2 frames per second
                if raw_spectral_data.shape[0]>2*(end_time-start_time).total_seconds():
                    logging.debug(f"PreProcess Stage 1: [{i+1}/{self.raw_labels.shape[0]}] with {timestamps.shape[0]} frames")

                    # raw_spectral_data=self.choose_frame_spatial.run(timestamps,raw_spectral_data)
                    np.savez(os.path.join(pre_dataset_save_dir,f"{self.raw_labels.iloc[i]['Furnace_Number']}.npz",),furnaceNumber=self.raw_labels.iloc[i]["Furnace_Number"],measureStartDatetime=self.raw_labels.iloc[i]["TSC_start_time"].strftime('%Y-%m-%d %H:%M:%S'),measureEndDatetime=self.raw_labels.iloc[i]["TSC_end_time"].strftime('%Y-%m-%d %H:%M:%S'),timestamps=timestamps,rawSpectralData=raw_spectral_data,rawLabels=self.raw_labels.iloc[i][self.labels_name].to_numpy(),labelNames=["Temperature","C","P","S","Mn","Ni","Mo","Cr"])
        # else:
        #     logging.info(f"Pre Dataset already exists in {pre_dataset_save_dir}")


if __name__=="__main__":

    logging.basicConfig(level = logging.DEBUG)

    raw_data_dir="/data/SEMS-model-training/old_rawdata"
    labels_path="/data/SEMS-model-training/labels/NanEr/2024-05-15_08-03_2335_.xlsx"
    dataset_dir="/data/SEMS-model-training/dataset"
    labels_name=["TSC_T","TSC_C","TSC_P","TSC_S","TSC_Mn","TSC_Ni","TSC_Mo","TSC_Cr"]

    # from choose_frame_spatial.mean import ChooseFrameSpatial
    # from features_scaling.max_min import FeatureScaling
    # from labels_scaling.max_min import LabelScaling
    # choose_frame_spatial=ChooseFrameSpatial(interval=[-30,30])
    # features_scaling=FeatureScaling()
    # labels_scaling=LabelScaling()

    data_pre_process=DataPreProcess(raw_data_dir,labels_path,labels_name,dataset_dir)
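For reference, each row of the raw-data .csv acts as an index into the .bin file of the same name: column 0 is the frame timestamp in milliseconds, column 1 the byte offset of the frame, and column 2 the frame size in bytes, which is what get_spectral_data_per_file uses to seek and reshape. Below is a minimal sketch of reading a single frame under those assumptions; read_one_frame is a hypothetical helper, and the default dimensions follow _get_spectral_spatial_dim.

import numpy as np

def read_one_frame(csv_path: str, frame_idx: int, spectral_dim: int = 224, spatial_dim: int = 512) -> np.ndarray:
    index = np.loadtxt(csv_path, delimiter=",").astype(np.uint64)
    offset = int(index[frame_idx, 1])   # byte offset of the frame in the .bin file
    nbytes = int(index[frame_idx, 2])   # size of one frame in bytes (e.g. 224*512*2 = 229376)
    with open(csv_path[:-3] + "bin", "rb") as f:
        f.seek(offset)
        buf = f.read(nbytes)
    # Frames are stored as uint16 with shape (spectral band, spatial point)
    return np.frombuffer(buf, dtype=np.uint16).reshape(spectral_dim, spatial_dim)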