diff --git a/.gitignore b/.gitignore index 59af8c0..0ad58a2 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,4 @@ dataset -rawdata +old_rawdata labels logs - diff --git a/CHANGELOG.md b/CHANGELOG.md index bda3c34..bf86a51 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,15 @@ # Change Log -## v0.1.0.20240910_alpha +## v0.0.2.20241015_alpha -- 跑通 \ No newline at end of file +# [⭐️Features] + +- 从 MySQL 数据库获取数据 + +# [🔄Changed] + +- 变更数据处理流程,先从数据库获取数据划分训练集测试集,得到中间文件,再进行选择空间点等后续处理 + +## v0.0.1.20240910_alpha + +- 跑通 diff --git a/TODO.md b/TODO.md index e69de29..2492718 100644 --- a/TODO.md +++ b/TODO.md @@ -0,0 +1,15 @@ +-[ ] 调研所有可调整超参数 + +-[ ] 调研所有回归任务的 loss 设计 + +-[ ] 调研所有可以用以回归任务的模型架构,及其可调整参数 + +-[ ] 调研跳出局部最优解的方法 + +-[x] 将数据库中的数据转化成训练集,验证集,测试集 + +-[x] 将训练集,验证集用不同的预处理、模型、超参数得到最优的模型 + +-[x] 将最优模型极其对应的超参数保存,并验证测试集得到测试误差 + +-[x] 不同的参数使用不同的模型 diff --git a/configs/main_train.yaml b/configs/main_train.yaml index 148d415..38b7737 100644 --- a/configs/main_train.yaml +++ b/configs/main_train.yaml @@ -5,16 +5,16 @@ defaults: task_name: main -raw_spectral_data_dir: /code/admin/20240806-NanEr-5-8-data/rawdata -raw_labels_dir: /code/admin/20240806-NanEr-5-8-data/labels/NanEr -dataset_dir: /code/admin/20240806-NanEr-5-8-data/dataset -labels_name: ["TSC_T","TSC_C","TSC_C_lab","TSC_P_lab"] +# raw_spectral_data_dir: /data/SEMS-model-training/rawdata +# raw_labels_dir: /data/SEMS-model-training/labels/NanEr +dataset_dir: /data/SEMS-model-training/dataset +# labels_name: ["TSC_T","TSC_C","TSC_C_lab","TSC_P_lab"] #是 - dataset: - train_ratio: 0.8 - validate_ratio: 0.1 - num_worker: 128 + # train_ratio: 0.8 + # validate_ratio: 0.1 + num_worker: 0 train: - max_epoch: 10000 + max_epoch: 5000 + checkpoint_interval: 100 diff --git a/configs/ray_tune/default.yaml b/configs/ray_tune/default.yaml index 733ef3a..bdd6f19 100644 --- a/configs/ray_tune/default.yaml +++ b/configs/ray_tune/default.yaml @@ -1,40 +1,37 @@ run: num_samples: 1 - resources_per_trial: - cpu: 128 - gpu: 1 + resources_per_trial: + cpu: 4 + gpu: 0.1 scheduler: - _target_: ray.tune.schedulers.ASHAScheduler - metric: val_loss - mode: min - max_t: 20000 - grace_period: 1 - reduction_factor: 2 + _target_: ray.tune.schedulers.FIFOScheduler + # _target_: ray.tune.schedulers.ASHAScheduler + # metric: val_loss + # mode: min + # max_t: 20000 + # grace_period: 1 + # reduction_factor: 2 config: choose_frame_spatial: _target_: ray.tune.grid_search values: - - _target_: data.choose_frame_spatial.mean.ChooseFrameSpatial - interval: [-3,0] - + - _target_: data.choose_frame_spatial.DBSCAN.ChooseFrameSpatial ##DBSCAN_data_augmentation features_scaling: _target_: ray.tune.grid_search values: - - _target_: data.features_scaling.max_min.FeatureScaling - + - _target_: data.features_scaling.standardization.FeatureScaling # labels_scaling: _target_: ray.tune.grid_search values: - - _target_: data.labels_scaling.max_min.LabelScaling + - _target_: data.labels_scaling.standardization.LabelScaling #standardization model: _target_: ray.tune.grid_search values: - # - _target_: model.FCNN.FCNN - # - _target_: model.CNN_LSTM_FCNN.CNN_LSTM_FCNN - - _target_: model.CNN1D_FCNN.CNN1D_FCNN + - _target_: model.DRSN-CW.SpectralModel + # - _target_: model.FCNN_DBSCAN_small.SpectralModel criterion: _target_: ray.tune.grid_search values: @@ -42,10 +39,15 @@ config: optimizer: _target_: ray.tune.grid_search values: + - _target_: torch.optim.Adam + _partial_: true + lr: 0.000001 - _target_: torch.optim.Adam _partial_: true lr: 0.0001 + batch_size: _target_: ray.tune.grid_search 
values: - - 128 \ No newline at end of file + - 1024 + - 128 diff --git a/src/data/__pycache__/pre_process.cpython-312.pyc b/src/data/__pycache__/pre_process.cpython-312.pyc index abb675b..a9c1c38 100644 Binary files a/src/data/__pycache__/pre_process.cpython-312.pyc and b/src/data/__pycache__/pre_process.cpython-312.pyc differ diff --git a/src/data/__pycache__/pre_process_backup.cpython-312.pyc b/src/data/__pycache__/pre_process_backup.cpython-312.pyc new file mode 100644 index 0000000..b2680cd Binary files /dev/null and b/src/data/__pycache__/pre_process_backup.cpython-312.pyc differ diff --git a/src/data/__pycache__/spectral_dataset.cpython-312.pyc b/src/data/__pycache__/spectral_dataset.cpython-312.pyc index eb54976..5a3313f 100644 Binary files a/src/data/__pycache__/spectral_dataset.cpython-312.pyc and b/src/data/__pycache__/spectral_dataset.cpython-312.pyc differ diff --git a/src/data/choose_frame_spatial/DBSCAN.py b/src/data/choose_frame_spatial/DBSCAN.py new file mode 100644 index 0000000..c1d65be --- /dev/null +++ b/src/data/choose_frame_spatial/DBSCAN.py @@ -0,0 +1,142 @@ +''' +@File : mean.py +@Time : 2024/08/12 13:53:36 +@Author : Zhanpeng Yang +@Version : 0.0.1 +@Contact : zhpyang@outlook.com +@Desc : 数据第一阶段预处理,从连续光谱数据中选出合适的光谱作为input的特征 +''' + + +import datetime +import numpy as np +from sklearn.cluster import DBSCAN +from pathlib import Path + +class ChooseFrameSpatial: + def __init__(self,eps=0.15,min_samples=10): + """注意,算法所需要的超参数均从此处传入,作为类成员,其他函数不允许传入超参数 + """ + self.eps=eps + self.min_samples=min_samples + self.description=f"{str(self.__class__)[8:-2].split(".")[-2]}_eps_{eps}_min_samples_{min_samples}" #此项当没有新增超参数时不需要修改 + + self.file_path = Path(__file__).absolute() + + + + def load_state_dict(self,state): + self.eps=state["eps"] + self.min_samples=state["min_samples"] + def state_dict(self): + return {"eps":self.eps,"min_samples":self.min_samples} + + + + def get_specific_data(self,measureStartDatetime,measureEndDatetime,timestamps,rawdata): + """ + 获取指定时间的内的光谱数据 + """ + + if isinstance(measureStartDatetime.item(),str): + start_timestamp=datetime.datetime.strptime(measureStartDatetime.item(), "%Y-%m-%d %H:%M:%S").timestamp()*1000 -1000 #测量开始时间(此项目中时间戳均以ms为单位) + end_timestamp=datetime.datetime.strptime(measureStartDatetime.item(), "%Y-%m-%d %H:%M:%S").timestamp()*1000 + 1000 #测量开始时间+3秒 + else: + start_timestamp=measureStartDatetime.item().timestamp()*1000 - 1000 + end_timestamp=measureStartDatetime.item().timestamp()*1000 + 1000 + + start_index=0 + end_index=timestamps.shape[0] + for i in range(1,timestamps.shape[0]): + if timestamps[i]>=start_timestamp and timestamps[i-1]<=start_timestamp: + # print(f"开始索引为{i}") + start_index=i + if timestamps[i]>=end_timestamp and timestamps[i-1]<=end_timestamp: + # print(f"结束索引为{i}") + end_index=i + + + return rawdata[start_index:end_index,...] 
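Editorial note on the window selection above (illustrative sketch, not part of the patch): `get_specific_data` slices out the frames whose millisecond timestamps fall within roughly ±1 s of `measureStartDatetime` by scanning the timestamp array with a Python loop. Assuming `timestamps` is a sorted 1-D array in milliseconds, the same slice can be taken with `np.searchsorted`; the helper name below is hypothetical, and tie handling at the exact boundaries may differ slightly from the loop.

```python
import numpy as np

def select_window(timestamps: np.ndarray, rawdata: np.ndarray,
                  start_ms: float, end_ms: float) -> np.ndarray:
    # First frame with timestamp >= start_ms, and one past the last frame
    # with timestamp <= end_ms; assumes timestamps is sorted ascending (ms).
    start_index = int(np.searchsorted(timestamps, start_ms, side="left"))
    end_index = int(np.searchsorted(timestamps, end_ms, side="right"))
    return rawdata[start_index:end_index, ...]
```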
+ + + + + def run(self,measureStartDatetime,measureEndDatetime,timestamps,rawdata): + + rawSpectralData=self.get_specific_data(measureStartDatetime,measureEndDatetime,timestamps,rawdata) + + + rawSpectralData=rawSpectralData.transpose(0, 2, 1) + rawSpectralData=rawSpectralData.reshape((rawSpectralData.shape[0]*rawSpectralData.shape[1],rawSpectralData.shape[2])) + + rawSpectralData_normed=(rawSpectralData-np.min(rawSpectralData,axis=1,keepdims=True))/(np.max(rawSpectralData,axis=1,keepdims=True)-np.min(rawSpectralData,axis=1,keepdims=True)) + db_norm = DBSCAN(eps=self.eps, min_samples=self.min_samples).fit(rawSpectralData_normed) + + labels_norm = db_norm.labels_ + n_norm = len(set(labels_norm)) - (1 if -1 in labels_norm else 0) + + + if n_norm==0: + return None + + max_i=0 + max_num=0 + for i in range(n_norm): + tmp=(labels_norm==i).sum() + if tmp>max_num: + max_i=i + max_num=tmp + + + selected_data=rawSpectralData_normed[labels_norm==max_i,:] + selected_data=np.mean(selected_data,axis=0) + + # space_intensity_mean = np.mean(np.sum(selected_data, axis=1), axis=0) + # space_num= 10 #空间上平均光强合适,且方差最小的10个点 + # light_indices = np.where( (space_intensity_mean > (0.25*np.max(space_intensity_mean)+0.75*np.min(space_intensity_mean))) + # & (space_intensity_mean < (0.8*np.max(space_intensity_mean)+0.2*np.min(space_intensity_mean))) + # ) + # if light_indices[0].shape[0] < space_num: + # print('No Enough Usable Space Point Found!') + # return None + + # intensity_data = selected_data[:,:,light_indices[0]] + + # # spatial_variance = np.var(np.sum(intensity_data,axis=1), axis=0) + # # top_10_indices = np.argsort(spatial_variance)[:space_num] + # space_selected_data = intensity_data[:,:,:] + + # space_selected_data = np.mean(np.mean(space_selected_data,axis=2),axis=0) + + # space_selected_data = (space_selected_data - np.min(space_selected_data)) / (np.max(space_selected_data) - np.min(space_selected_data)) + + # # Savitzky-Golay filter + # data_filtered = savgol_filter(space_selected_data, window_length=31, polyorder=2, axis=0) + + # spec_indices = [106, 127, 118, 0, 52, 23, 113, 77, 157, 175, 195, 24, 218, 108, 8, 211, 2, 136, 11, + # 36, 129, 109, 153, 188, 200, 196, 7, 19, 6, 198, 193, 221, 156, 187, 162, 204, 85, 104, 41, + # 16, 185, 125, 14, 149, 91, 138, 72, 146, 35, 53, 190, 148, 75, 18, 17, 96, 167, 192, 201, 31, + # 158, 183, 32, 40, 123, 145, 161, 27, 209, 216, 101, 51, 147, 58, 182, 49, 119, 13, 179, 140, + # 105, 45, 55, 33, 73, 111, 97, 194, 121, 89, 38, 12, 197, 173, 160, 131, 141, 37, 208, 47] + + + # selected_data = data_filtered[spec_indices] + + + return selected_data + +if __name__ =="__main__": + raw_data=np.load("/data/SEMS-model-training/dataset/raw_dataset/test/24609591.npz",allow_pickle=True) + print(raw_data.keys()) + + print(f"炉次号\t:{raw_data["furnaceNumber"]}") + print(f"开始测量时间\t:{raw_data["measureStartDatetime"]}") + print(f"结束测量时间\t:{raw_data["measureEndDatetime"]}") + print(f"时间戳维度\t:{raw_data["timestamps"].shape}") + print(f"原始光谱数据维度\t:{raw_data["rawSpectralData"].shape}") + + tmp=ChooseFrameSpatial() + output=tmp.run(raw_data["measureStartDatetime"],raw_data["measureEndDatetime"],raw_data["timestamps"],raw_data["rawSpectralData"]) + + + print(f"输入数据维度:{raw_data["rawSpectralData"].shape}\n输出数据维度:{output.shape}") \ No newline at end of file diff --git a/src/data/choose_frame_spatial/DBSCAN_data_augmentation.py b/src/data/choose_frame_spatial/DBSCAN_data_augmentation.py new file mode 100644 index 0000000..5ea220d --- /dev/null +++ 
b/src/data/choose_frame_spatial/DBSCAN_data_augmentation.py @@ -0,0 +1,102 @@ +''' +@File : mean.py +@Time : 2024/08/12 13:53:36 +@Author : Zhanpeng Yang +@Version : 0.0.1 +@Contact : zhpyang@outlook.com +@Desc : 数据第一阶段预处理,从连续光谱数据中选出合适的光谱作为input的特征 +''' + + +import datetime +import numpy as np +from sklearn.cross_decomposition import PLSRegression +from scipy.signal import savgol_filter +from sklearn.cluster import DBSCAN + +class ChooseFrameSpatial: + def __init__(self,eps=0.15,min_samples=10): + """注意,算法所需要的超参数均从此处传入,作为类成员,其他函数不允许传入超参数 + """ + self.eps=eps + self.min_samples=min_samples + self.description=f"{str(self.__class__)[8:-2].split(".")[-2]}_eps_{eps}_min_samples_{min_samples}" #此项当没有新增超参数时不需要修改 + + + def get_specific_data(self,measureStartDatetime,measureEndDatetime,timestamps,rawdata): + """ + 获取指定时间的内的光谱数据 + """ + + if isinstance(measureStartDatetime.item(),str): + start_timestamp=datetime.datetime.strptime(measureStartDatetime.item(), "%Y-%m-%d %H:%M:%S").timestamp()*1000 -1000 #测量开始时间(此项目中时间戳均以ms为单位) + end_timestamp=datetime.datetime.strptime(measureStartDatetime.item(), "%Y-%m-%d %H:%M:%S").timestamp()*1000 + 1000 #测量开始时间+3秒 + else: + start_timestamp=measureStartDatetime.item().timestamp()*1000 - 1000 + end_timestamp=measureStartDatetime.item().timestamp()*1000 + 1000 + + start_index=0 + end_index=timestamps.shape[0] + for i in range(1,timestamps.shape[0]): + if timestamps[i]>=start_timestamp and timestamps[i-1]<=start_timestamp: + # print(f"开始索引为{i}") + start_index=i + if timestamps[i]>=end_timestamp and timestamps[i-1]<=end_timestamp: + # print(f"结束索引为{i}") + end_index=i + + + return rawdata[start_index:end_index,...] + + + + + def run(self,measureStartDatetime,measureEndDatetime,timestamps,rawdata): + + rawSpectralData=self.get_specific_data(measureStartDatetime,measureEndDatetime,timestamps,rawdata) + + + rawSpectralData=rawSpectralData.transpose(0, 2, 1) + rawSpectralData=rawSpectralData.reshape((rawSpectralData.shape[0]*rawSpectralData.shape[1],rawSpectralData.shape[2])) + + rawSpectralData_normed=(rawSpectralData-np.min(rawSpectralData,axis=1,keepdims=True))/(np.max(rawSpectralData,axis=1,keepdims=True)-np.min(rawSpectralData,axis=1,keepdims=True)) + db_norm = DBSCAN(eps=self.eps, min_samples=self.min_samples).fit(rawSpectralData_normed) + + labels_norm = db_norm.labels_ + n_norm = len(set(labels_norm)) - (1 if -1 in labels_norm else 0) + + + if n_norm==0: + return None + + max_i=0 + max_num=0 + for i in range(n_norm): + tmp=(labels_norm==i).sum() + if tmp>max_num: + max_i=i + max_num=tmp + + + selected_data=rawSpectralData_normed[labels_norm==max_i,:] + # selected_data=np.mean(selected_data,axis=0) + + + + return selected_data + +if __name__ =="__main__": + raw_data=np.load("/data/SEMS-model-training/dataset/raw_dataset/test/24609591.npz",allow_pickle=True) + print(raw_data.keys()) + + print(f"炉次号\t:{raw_data["furnaceNumber"]}") + print(f"开始测量时间\t:{raw_data["measureStartDatetime"]}") + print(f"结束测量时间\t:{raw_data["measureEndDatetime"]}") + print(f"时间戳维度\t:{raw_data["timestamps"].shape}") + print(f"原始光谱数据维度\t:{raw_data["rawSpectralData"].shape}") + + tmp=ChooseFrameSpatial() + output=tmp.run(raw_data["measureStartDatetime"],raw_data["measureEndDatetime"],raw_data["timestamps"],raw_data["rawSpectralData"]) + + + print(f"输入数据维度:{raw_data["rawSpectralData"].shape}\n输出数据维度:{output.shape}") \ No newline at end of file diff --git a/src/data/choose_frame_spatial/__pycache__/DBSCAN.cpython-312.pyc b/src/data/choose_frame_spatial/__pycache__/DBSCAN.cpython-312.pyc new file 
mode 100644 index 0000000..2cfaf00 Binary files /dev/null and b/src/data/choose_frame_spatial/__pycache__/DBSCAN.cpython-312.pyc differ diff --git a/src/data/choose_frame_spatial/__pycache__/DBSCAN_data_augmentation.cpython-312.pyc b/src/data/choose_frame_spatial/__pycache__/DBSCAN_data_augmentation.cpython-312.pyc new file mode 100644 index 0000000..00e1d72 Binary files /dev/null and b/src/data/choose_frame_spatial/__pycache__/DBSCAN_data_augmentation.cpython-312.pyc differ diff --git a/src/data/choose_frame_spatial/__pycache__/FCNN_CARS.cpython-312.pyc b/src/data/choose_frame_spatial/__pycache__/FCNN_CARS.cpython-312.pyc new file mode 100644 index 0000000..011b4f8 Binary files /dev/null and b/src/data/choose_frame_spatial/__pycache__/FCNN_CARS.cpython-312.pyc differ diff --git a/src/data/choose_frame_spatial/__pycache__/mean.cpython-312.pyc b/src/data/choose_frame_spatial/__pycache__/mean.cpython-312.pyc index 79af4ab..c2fed23 100644 Binary files a/src/data/choose_frame_spatial/__pycache__/mean.cpython-312.pyc and b/src/data/choose_frame_spatial/__pycache__/mean.cpython-312.pyc differ diff --git a/src/data/choose_frame_spatial/__pycache__/mean_CARS100_3.cpython-312.pyc b/src/data/choose_frame_spatial/__pycache__/mean_CARS100_3.cpython-312.pyc new file mode 100644 index 0000000..79e33b2 Binary files /dev/null and b/src/data/choose_frame_spatial/__pycache__/mean_CARS100_3.cpython-312.pyc differ diff --git a/src/data/choose_frame_spatial/__pycache__/mean_chose_top_10_spatial.cpython-312.pyc b/src/data/choose_frame_spatial/__pycache__/mean_chose_top_10_spatial.cpython-312.pyc new file mode 100644 index 0000000..ea29499 Binary files /dev/null and b/src/data/choose_frame_spatial/__pycache__/mean_chose_top_10_spatial.cpython-312.pyc differ diff --git a/src/data/choose_frame_spatial/__pycache__/meanwovar.cpython-312.pyc b/src/data/choose_frame_spatial/__pycache__/meanwovar.cpython-312.pyc new file mode 100644 index 0000000..7275cb2 Binary files /dev/null and b/src/data/choose_frame_spatial/__pycache__/meanwovar.cpython-312.pyc differ diff --git a/src/data/choose_frame_spatial/__pycache__/meanwovar_CARS.cpython-312.pyc b/src/data/choose_frame_spatial/__pycache__/meanwovar_CARS.cpython-312.pyc new file mode 100644 index 0000000..6cf1994 Binary files /dev/null and b/src/data/choose_frame_spatial/__pycache__/meanwovar_CARS.cpython-312.pyc differ diff --git a/src/data/choose_frame_spatial/mean.py b/src/data/choose_frame_spatial/mean.py deleted file mode 100644 index 64f2d3e..0000000 --- a/src/data/choose_frame_spatial/mean.py +++ /dev/null @@ -1,100 +0,0 @@ -''' -@File : mean.py -@Time : 2024/08/12 13:53:36 -@Author : Zhanpeng Yang -@Version : 0.0.1 -@Contact : zhpyang@outlook.com -@Desc : 数据第一阶段预处理,从连续光谱数据中选出合适的光谱作为input的特征 -''' - - - -import datetime -import numpy as np -class ChooseFrameSpatial: - def __init__(self, interval=[-3,0]): - """注意,算法所需要的超参数均从此处传入,作为类成员,其他函数不允许传入超参数 - - Args: - interval (list, optional): 此方法依赖的光谱时间范围,默认[-30,30],即获取前TSC开始时刻前30秒到TSC结束时刻的光谱数据作为此算法输入. 
- """ - self.description=f"{str(self.__class__).split(".")[2]}_{interval[0]}_{interval[1]}" - # print(self.description) - self.interval=interval - - def get_data_interval(self, start_time, end_time ): - - return start_time+datetime.timedelta(seconds=self.interval[0]),start_time+datetime.timedelta(seconds=self.interval[1]) - - def run(self,timestamps,rawdata): - ############# 空间数据压缩 ###################### - space_intensity_mean = np.mean(np.sum(rawdata, axis=1), axis=0) - - space_num= 10 #选取方差最小的点的个数 - - light_indices = np.where( (space_intensity_mean > (0.25*np.max(space_intensity_mean)+0.75*np.min(space_intensity_mean))) - & (space_intensity_mean < (0.8*np.max(space_intensity_mean)+0.2*np.min(space_intensity_mean))) - ) - - if light_indices[0].shape[0] < space_num: - print('No Enough Usable Space Point Found!') - return None - - intensity_data = rawdata[:,:,light_indices[0]] - - spatial_variance = np.var(np.sum(intensity_data,axis=1), axis=0) - top_10_indices = np.argsort(spatial_variance)[:space_num] - space_selected_data = intensity_data[:, :, top_10_indices] - - ############# 时间数据压缩 ###################### - # framerate = 24 - # timewindow = 2 - - # time_data_intensity = np.sum(np.mean(space_selected_data,axis=2),axis=1) - - # min_var = float('inf') - # min_index = 0 - - # for i in range(len(time_data_intensity)-framerate*timewindow-1): - # window_var = np.var(time_data_intensity[i:i+framerate*timewindow]) - # if window_var < min_var: - # min_var = window_var - # min_index = i - - # selected_data = space_selected_data[min_index:min_index+framerate*timewindow,:,:] - - selected_data=np.mean(np.mean(space_selected_data,axis=0),axis=1) - # print("timewindow_begin=",min_index) - - # if (result_dir is not None) and (filenum is not None) : - - # for i in range (selected_data.shape[2]): - - # Z=selected_data[:,:,i] - # x=np.linspace(400,1000,Z.shape[1]) - # y=selected_data[:,1,i] - - # x, y = np.meshgrid(x, y) - - # fig, ax = plt.subplots(subplot_kw=dict(projection='3d')) - - # surf = ax.plot_surface(x, y, Z, cmap=cm.Blues, - # linewidth=0, antialiased=False) - - # ax.view_init(elev=30, azim=-60) # 更改视角 - - # ax.set_zlim(0, 4095) - # ax.set_zlabel("Light Intensity") - # ax.set_xlabel("Spectral Band(nm)") - # ax.set_ylabel("Time (Serial Number)") - # plt.savefig(f"{result_dir}{ filenum} file {i}-th spatial_point_line.png") - # plt.close() - - return selected_data - -if __name__ =="__main__": - timpestamp=np.zeros((600,)) - input_data=np.random.random((600,224,512))*4095 - tmp=ChooseFrameSpatial() - output=tmp.run(timpestamp,input_data) - print(f"输入数据维度:{input_data.shape}\n输出数据维度:{output.shape}") \ No newline at end of file diff --git a/src/data/choose_frame_spatial/mean_CARS100_3.py b/src/data/choose_frame_spatial/mean_CARS100_3.py new file mode 100644 index 0000000..64bfb58 --- /dev/null +++ b/src/data/choose_frame_spatial/mean_CARS100_3.py @@ -0,0 +1,125 @@ +''' +@File : mean.py +@Time : 2024/08/12 13:53:36 +@Author : Zhanpeng Yang +@Version : 0.0.1 +@Contact : zhpyang@outlook.com +@Desc : 数据第一阶段预处理,从连续光谱数据中选出合适的光谱作为input的特征 +''' + + + +import datetime +import numpy as np +from sklearn.cross_decomposition import PLSRegression +from scipy.signal import savgol_filter +from scipy.ndimage import median_filter + +class ChooseFrameSpatial: + def __init__(self): + """注意,算法所需要的超参数均从此处传入,作为类成员,其他函数不允许传入超参数 + """ + + self.description=f"{str(self.__class__)[8:-2].split(".")[-2]}" #此项当没有新增超参数时不需要修改 + + print(self.description) + + def 
get_specific_data(self,measureStartDatetime,measureEndDatetime,timestamps,rawdata): + """ + 获取指定时间的内的光谱数据 + """ + + + if isinstance(measureStartDatetime.item(),str): + start_timestamp=datetime.datetime.strptime(measureStartDatetime.item(), "%Y-%m-%d %H:%M:%S").timestamp()*1000 #测量开始时间(此项目中时间戳均以ms为单位) + end_timestamp=datetime.datetime.strptime(measureStartDatetime.item(), "%Y-%m-%d %H:%M:%S").timestamp()*1000 + 3000 #测量开始时间+3秒 + else: + start_timestamp=measureStartDatetime.item().timestamp()*1000 + end_timestamp=measureStartDatetime.item().timestamp()*1000 + 3000 + + start_index=0 + end_index=timestamps.shape[0] + for i in range(1,timestamps.shape[0]): + if timestamps[i]>=start_timestamp and timestamps[i-1]<=start_timestamp: + # print(f"开始索引为{i}") + start_index=i + if timestamps[i]>=end_timestamp and timestamps[i-1]<=end_timestamp: + # print(f"结束索引为{i}") + end_index=i + + + return rawdata[start_index:end_index,...] + + + + + def run(self,measureStartDatetime,measureEndDatetime,timestamps,rawdata): + + selected_data=self.get_specific_data(measureStartDatetime,measureEndDatetime,timestamps,rawdata) + + + + space_intensity_mean = np.mean(np.sum(selected_data, axis=1), axis=0) + space_num= 10 #空间上平均光强合适,且方差最小的10个点 + light_indices = np.where( (space_intensity_mean > (0.25*np.max(space_intensity_mean)+0.75*np.min(space_intensity_mean))) + & (space_intensity_mean < (0.8*np.max(space_intensity_mean)+0.2*np.min(space_intensity_mean))) + ) + if light_indices[0].shape[0] < space_num: + print('No Enough Usable Space Point Found!') + return None + + intensity_data = selected_data[:,:,light_indices[0]] + + # spatial_variance = np.var(np.sum(intensity_data,axis=1), axis=0) + # top_10_indices = np.argsort(spatial_variance)[:space_num] + + time_window = 11 + time_steps, wavelengths, spatial_points = intensity_data.shape + filtered_data = np.zeros_like(intensity_data) + for wavelength in range(wavelengths): + for spatial_point in range(spatial_points): + # 提取时间序列 + time_series = intensity_data[:, wavelength, spatial_point] + # 应用中值滤波 + filtered_time_series = median_filter(time_series, size=time_window) + # 将滤波后的时间序列放回原位置 + filtered_data[:, wavelength, spatial_point] = filtered_time_series + + + space_selected_data = filtered_data[:,:,:] + + space_selected_data = np.mean(np.mean(space_selected_data,axis=2),axis=0) + + # space_selected_data = (space_selected_data - np.min(space_selected_data)) / (np.max(space_selected_data) - np.min(space_selected_data)) + + # # Savitzky-Golay filter + # data_filtered = savgol_filter(space_selected_data, window_length=31, polyorder=2, axis=0) + + # spec_indices = [106, 127, 118, 0, 52, 23, 113, 77, 157, 175, 195, 24, 218, 108, 8, 211, 2, 136, 11, + # 36, 129, 109, 153, 188, 200, 196, 7, 19, 6, 198, 193, 221, 156, 187, 162, 204, 85, 104, 41, + # 16, 185, 125, 14, 149, 91, 138, 72, 146, 35, 53, 190, 148, 75, 18, 17, 96, 167, 192, 201, 31, + # 158, 183, 32, 40, 123, 145, 161, 27, 209, 216, 101, 51, 147, 58, 182, 49, 119, 13, 179, 140, + # 105, 45, 55, 33, 73, 111, 97, 194, 121, 89, 38, 12, 197, 173, 160, 131, 141, 37, 208, 47] + + + # selected_data = data_filtered[spec_indices] + + # print(space_selected_data.shape) + return space_selected_data + # return selected_data + +if __name__ =="__main__": + raw_data=np.load("/data/SEMS-model-training/dataset/raw_dataset/training/24604919.npz",allow_pickle=True) + print(raw_data.keys()) + + print(f"炉次号\t:{raw_data["furnaceNumber"]}") + print(f"开始测量时间\t:{raw_data["measureStartDatetime"]}") + 
print(f"结束测量时间\t:{raw_data["measureEndDatetime"]}") + print(f"时间戳维度\t:{raw_data["timestamps"].shape}") + print(f"原始光谱数据维度\t:{raw_data["rawSpectralData"].shape}") + + tmp=ChooseFrameSpatial() + output=tmp.run(raw_data["measureStartDatetime"],raw_data["measureEndDatetime"],raw_data["timestamps"],raw_data["rawSpectralData"]) + + + print(f"输入数据维度:{raw_data["rawSpectralData"].shape}\n输出数据维度:{output.shape}") \ No newline at end of file diff --git a/src/data/choose_frame_spatial/mean_chose_top_10_spatial.py b/src/data/choose_frame_spatial/mean_chose_top_10_spatial.py new file mode 100644 index 0000000..86136a9 --- /dev/null +++ b/src/data/choose_frame_spatial/mean_chose_top_10_spatial.py @@ -0,0 +1,83 @@ +''' +@File : mean.py +@Time : 2024/08/12 13:53:36 +@Author : Zhanpeng Yang +@Version : 0.0.1 +@Contact : zhpyang@outlook.com +@Desc : 数据第一阶段预处理,从连续光谱数据中选出合适的光谱作为input的特征 +''' + + + +import datetime +import numpy as np + +class ChooseFrameSpatial: + def __init__(self,): + """注意,算法所需要的超参数均从此处传入,作为类成员,其他函数不允许传入超参数 + """ + + + self.description=f"{str(self.__class__)[8:-2].split(".")[-2]}" #此项当没有新增超参数时不需要修改 + + print(self.description) + + def get_specific_data(self,measureStartDatetime,measureEndDatetime,timestamps,rawdata): + """ + 获取指定时间的内的光谱数据 + """ + start_timestamp=datetime.datetime.strptime(measureStartDatetime.item(), "%Y-%m-%d %H:%M:%S").timestamp()*1000 -1000 #测量开始时间-1秒(此项目中时间戳均以ms为单位) + end_timestamp=datetime.datetime.strptime(measureStartDatetime.item(), "%Y-%m-%d %H:%M:%S").timestamp()*1000 +1000 #测量开始时间+1秒 + + for i in range(1,timestamps.shape[0]): + if timestamps[i]>=start_timestamp and timestamps[i-1]<=start_timestamp: + # print(f"开始索引为{i}") + start_index=i + if timestamps[i]>=end_timestamp and timestamps[i-1]<=end_timestamp: + # print(f"结束索引为{i}") + end_index=i + + return rawdata[start_index:end_index,...] 
+ + def run(self,measureStartDatetime,measureEndDatetime,timestamps,rawdata): + + selected_data=self.get_specific_data(measureStartDatetime,measureEndDatetime,timestamps,rawdata) + + + + + space_intensity_mean = np.mean(np.sum(selected_data, axis=1), axis=0) + space_num= 10 #空间上平均光强合适,且方差最小的10个点 + light_indices = np.where( (space_intensity_mean > (0.25*np.max(space_intensity_mean)+0.75*np.min(space_intensity_mean))) + & (space_intensity_mean < (0.8*np.max(space_intensity_mean)+0.2*np.min(space_intensity_mean))) + ) + if light_indices[0].shape[0] < space_num: + print('No Enough Usable Space Point Found!') + return None + + intensity_data = selected_data[:,:,light_indices[0]] + + spatial_variance = np.var(np.sum(intensity_data,axis=1), axis=0) + top_10_indices = np.argsort(spatial_variance)[:space_num] + selected_data = intensity_data[:, :, top_10_indices] + + + + + return selected_data + +if __name__ =="__main__": + raw_data=np.load("/data/SEMS-model-training/dataset/raw_dataset/training/24604919.npz",allow_pickle=True) + print(raw_data.keys()) + + print(f"炉次号\t:{raw_data["furnaceNumber"]}") + print(f"开始测量时间\t:{raw_data["measureStartDatetime"]}") + print(f"结束测量时间\t:{raw_data["measureEndDatetime"]}") + print(f"时间戳维度\t:{raw_data["timestamps"].shape}") + print(f"原始光谱数据维度\t:{raw_data["rawSpectralData"].shape}") + + tmp=ChooseFrameSpatial() + output=tmp.run(raw_data["measureStartDatetime"],raw_data["measureEndDatetime"],raw_data["timestamps"],raw_data["rawSpectralData"]) + + + print(f"输入数据维度:{raw_data["rawSpectralData"].shape}\n输出数据维度:{output.shape}") \ No newline at end of file diff --git a/src/data/choose_frame_spatial/mean_var_weight_reduction.py b/src/data/choose_frame_spatial/mean_var_weight_reduction.py deleted file mode 100644 index f27edc4..0000000 --- a/src/data/choose_frame_spatial/mean_var_weight_reduction.py +++ /dev/null @@ -1,132 +0,0 @@ -''' -@File : mean.py -@Time : 2024/08/12 13:53:36 -@Author : Zhanpeng Yang -@Version : 0.0.1 -@Contact : zhpyang@outlook.com -@Desc : 数据第一阶段预处理,从连续光谱数据中选出合适的光谱作为input的特征 -''' - - - - -import datetime -import numpy as np -class ChooseFrameSpatial: - def __init__(self, interval=[-30,30],overexposion_threshold = 1,weight_mean_var = 0.5,space_num= 10, time_num = 9,framerate = 4,timewindow_time = 1,Type=6): - """注意,算法所需要的超参数均从此处传入,作为类成员,其他函数不允许传入超参数 - - Args: - interval (list, optional): 此方法依赖的光谱时间范围,默认[-30,30],即获取前TSC开始时刻前30秒到TSC结束时刻的光谱数据作为此算法输入. 
- """ - # print(str(self.__class__).split(".")) - self.description=f"{str(self.__class__).split(".")[-2]}_{interval[0]}_{interval[1]}" - - # print(self.description) - - self.overexposion_threshold = overexposion_threshold #过曝阈值:有多少时间点过曝时,判定为这个空间点过曝 - self.weight_mean_var =weight_mean_var #强度/方差选取权重,默认平均光强权重为1,此权重为方差权重,取值为[0,∞) - self.space_num= space_num #选取空间点数量 - self.time_num = time_num #选取时间点数量 - self.framerate = framerate #采样帧率 - self.timewindow = framerate * timewindow_time #选取时间窗口为1s - - - self.interval=interval - self.Type=Type - - def get_data_interval(self, start_time, end_time ): - - return start_time+datetime.timedelta(seconds=self.interval[0]),end_time+datetime.timedelta(seconds=self.interval[1]) - - def Overexposed_spatial_point_detection(self,inputdata, timethreshold): - #判别一个空间点是否过曝,inputdata是(time,wavelength)二维数组,timethreshold是时间阈值,有多少个时间点过曝则认为这个点过曝 - # 如果过曝返回False,不过曝返回True - - row_max_values= np.max(inputdata, axis=1) - overexposed_rows = np.sum(row_max_values >= 4095) - if overexposed_rows > timethreshold: - return False - else: - return True - def run(self,timestamps,rawdata): - # 降维方法:空间上选取强度达到一定的阈值且方差最小的点,时间上把整个时间段划分并选取各段权重最高的1s进行平均,Type=1取前1/3,Type=2取中部1/3,Type=3取最后1/3,Type=4取全部 - - - ############# 空间过曝点去除 ###################### - unoverposed_indices = [i for i in range(rawdata.shape[2]) if self.Overexposed_spatial_point_detection(rawdata[:, :, i],self.overexposion_threshold)] - rawdata = rawdata[:,:,unoverposed_indices] - - ############# 空间数据压缩 ###################### - space_intensity_mean = np.mean(np.sum(rawdata, axis=1), axis=0) - space_intensity_var = np.var(np.sum(rawdata, axis=1), axis=0) - combined_score = (space_intensity_mean - np.min(space_intensity_mean)) / (np.max(space_intensity_mean) - np.min(space_intensity_mean)) \ - - self.weight_mean_var * (space_intensity_var - np.min(space_intensity_var)) / (np.max(space_intensity_var) - np.min(space_intensity_var)) - - sorted_indices = np.argsort(combined_score)[::-1] - space_selected_data = rawdata[:,:,sorted_indices[:self.space_num]] - - ############# 时间数据压缩 ###################### - space_selected_data_intensity = np.sum(space_selected_data,axis=1) - - #按照时间把强度数据拆分成num_time份,每一份维度为(总时间点数/time_num, space_num) - time_length = space_selected_data_intensity.shape[0] - chunk_size = time_length // self.time_num - remainder = time_length % self.time_num - start_index = 0 - - time_selected_data =None - - for i in range(self.time_num): - end_index = start_index + chunk_size + (1 if i < remainder else 0) - chunk = space_selected_data_intensity[start_index:end_index, :] - - window_var = np.empty(0) - window_mean = np.empty(0) - - for j in range(len(chunk)-self.timewindow-1): - window_var = np.concatenate((window_var, np.expand_dims( np.sum(np.var(chunk[j:j+self.timewindow,:], axis=0)),0)), axis=0) - window_mean = np.concatenate((window_mean, np.expand_dims( np.sum(np.mean(chunk[j:j+self.timewindow,:], axis=0)),0)), axis=0) - - combined_score_time = (window_mean - np.min(window_mean)) / (np.max(window_mean) - np.min(window_mean)) \ - - self.weight_mean_var * (window_var - np.min(window_var)) / (np.max(window_var) - np.min(window_var)) - sorted_indices = np.argsort(combined_score_time)[::-1] - - time_window_data = np.mean(space_selected_data[start_index+sorted_indices[0]:start_index+sorted_indices[0]+self.timewindow, :, :],axis=0) - # print(time_selected_data.shape,time_window_data.shape,np.expand_dims( time_window_data,0).shape) - if time_selected_data is None: - - time_selected_data=np.expand_dims( time_window_data,0) - 
else: - time_selected_data = np.concatenate((time_selected_data, np.expand_dims( time_window_data,0)), axis=0) - - start_index = end_index - - if self.Type == 1: - print("time_selected_data[前1/3].shape: ", time_selected_data[:self.time_num//3,:,:].shape) - return time_selected_data[:self.time_num//3,:,:] - elif self.Type == 2: - print("time_selected_data[中1/3].shape: ", time_selected_data[self.time_num//3:2*self.time_num//3,:,:].shape) - return time_selected_data[self.time_num//3:2*self.time_num//3,:,:] - elif self.Type == 3: - print("time_selected_data[后1/3].shape: ", time_selected_data[2*self.time_num//3:,:,:].shape) - return time_selected_data[2*self.time_num//3:,:,:] - elif self.Type == 4: - print("time_selected_data[前2/3].shape: ", time_selected_data[:2*self.time_num//3,:,:].shape) - return time_selected_data[:2*self.time_num//3,:,:] - elif self.Type == 5: - print("time_selected_data[后2/3].shape: ", time_selected_data[self.time_num//3:,:,:].shape) - return time_selected_data[self.time_num//3:,:,:] - elif self.Type == 6: - print("time_selected_data.shape: ", time_selected_data.shape) - return time_selected_data - else: - print("Type is not 1, 2, 3, 4, 5 or 6 !") - return None - -if __name__ =="__main__": - timpestamp=np.zeros((600,)) - input_data=np.random.random((600,224,512))*4095 - tmp=ChooseFrameSpatial() - output=tmp.run(timpestamp,input_data) - print(f"输入数据维度:{input_data.shape}\n输出数据维度:{output.shape}") \ No newline at end of file diff --git a/src/data/choose_frame_spatial/meanwovar_CARS.py b/src/data/choose_frame_spatial/meanwovar_CARS.py new file mode 100644 index 0000000..b72bcce --- /dev/null +++ b/src/data/choose_frame_spatial/meanwovar_CARS.py @@ -0,0 +1,78 @@ +''' +@File : mean.py +@Time : 2024/08/12 13:53:36 +@Author : Zhanpeng Yang +@Version : 0.0.1 +@Contact : zhpyang@outlook.com +@Desc : 数据第一阶段预处理,从连续光谱数据中选出合适的光谱作为input的特征 +''' + + + +import datetime +import numpy as np +from scipy.signal import savgol_filter + +class ChooseFrameSpatial: + def __init__(self,): + """注意,算法所需要的超参数均从此处传入,作为类成员,其他函数不允许传入超参数 + + Args: + interval (list, optional): 此方法依赖的光谱时间范围,默认[-30,30],即获取前TSC开始时刻前30秒到TSC结束时刻的光谱数据作为此算法输入. 
+ """ + + + + # tmp= + # if len(tmp)>2: + # self.description=f"{str(self.__class__).split(".")[-2]}_{interval[0]}_{interval[1]}" + self.description=f"{str(self.__class__)[8:-2].split(".")[-2]}" + + print(self.description) + # print(self.description) + + + + def run(self,measureStartDatetime,measureEndDatetime,timestamps,rawdata): + ############# 空间数据压缩 ###################### + space_intensity_mean = np.mean(np.sum(rawdata, axis=1), axis=0) + + space_num= 10 #选取方差最小的点的个数 + + light_indices = np.where( (space_intensity_mean > (0.25*np.max(space_intensity_mean)+0.75*np.min(space_intensity_mean))) + & (space_intensity_mean < (0.8*np.max(space_intensity_mean)+0.2*np.min(space_intensity_mean))) + ) + + if light_indices[0].shape[0] < space_num: + print('No Enough Usable Space Point Found!') + return None + + intensity_data = rawdata[:,:,light_indices[0]] + + spatial_variance = np.var(np.sum(intensity_data,axis=1), axis=0) + top_10_indices = np.argsort(spatial_variance)[:space_num] + space_selected_data = intensity_data[:, :, top_10_indices] + + + selected_data=np.mean(np.mean(space_selected_data,axis=0),axis=1) + + selected_data = savgol_filter(selected_data, window_length=11, polyorder=2, axis=0) + + top_features = [106, 127, 118, 0, 52, 23, 113, 77, 157, 175, 195, 24, 218, 108, 8, 211, 2, 136, + 11, 36, 129, 109, 153, 188, 200, 196, 7, 19, 6, 198, 193, 221, 156, 187, 162, 204, + 85, 104, 41, 16, 185, 125, 14, 149, 91, 138, 72, 146, 35, 53, 190, 148, 75, 18, + 17, 96, 167, 192, 201, 31, 158, 183, 32, 40, 123, 145, 161, 27, 209, 216, 101, 51, + 147, 58, 182, 49, 119, 13, 179, 140, 105, 45, 55, 33, 73, 111, 97, 194, 121, 89, + 38, 12, 197, 173, 160, 131, 141, 37, 208, 47] + selected_data=selected_data[top_features] + + + + return selected_data + +if __name__ =="__main__": + timpestamp=np.zeros((600,)) + input_data=np.random.random((56,224,512))*4095 + tmp=ChooseFrameSpatial() + output=tmp.run(0,0,timpestamp,input_data) + print(f"输入数据维度:{input_data.shape}\n输出数据维度:{output.shape}") \ No newline at end of file diff --git a/src/data/features_scaling/__pycache__/do_nothing.cpython-312.pyc b/src/data/features_scaling/__pycache__/do_nothing.cpython-312.pyc new file mode 100644 index 0000000..09e3c62 Binary files /dev/null and b/src/data/features_scaling/__pycache__/do_nothing.cpython-312.pyc differ diff --git a/src/data/features_scaling/__pycache__/frame_max_min.cpython-312.pyc b/src/data/features_scaling/__pycache__/frame_max_min.cpython-312.pyc new file mode 100644 index 0000000..76cd4f3 Binary files /dev/null and b/src/data/features_scaling/__pycache__/frame_max_min.cpython-312.pyc differ diff --git a/src/data/features_scaling/__pycache__/global_max_min.cpython-312.pyc b/src/data/features_scaling/__pycache__/global_max_min.cpython-312.pyc new file mode 100644 index 0000000..dcf9746 Binary files /dev/null and b/src/data/features_scaling/__pycache__/global_max_min.cpython-312.pyc differ diff --git a/src/data/features_scaling/__pycache__/standardization.cpython-312.pyc b/src/data/features_scaling/__pycache__/standardization.cpython-312.pyc new file mode 100644 index 0000000..84f3590 Binary files /dev/null and b/src/data/features_scaling/__pycache__/standardization.cpython-312.pyc differ diff --git a/src/data/features_scaling/do_nothing.py b/src/data/features_scaling/do_nothing.py new file mode 100644 index 0000000..2620cf3 --- /dev/null +++ b/src/data/features_scaling/do_nothing.py @@ -0,0 +1,21 @@ +''' +@File : max_min.py +@Time : 2024/08/12 14:02:59 +@Author : Zhanpeng Yang +@Version : 0.0.1 +@Contact : 
zhpyang@outlook.com +@Desc : 对输入特征进行标准化放缩 +''' + + + +import numpy as np +class FeatureScaling: + def __init__(self): + self.description=f"{str(self.__class__).split(".")[-2]}" + self.flag_get_global_info=False + + + def run( self,feature): + + return feature \ No newline at end of file diff --git a/src/data/features_scaling/frame_max_min.py b/src/data/features_scaling/frame_max_min.py new file mode 100644 index 0000000..4af2fca --- /dev/null +++ b/src/data/features_scaling/frame_max_min.py @@ -0,0 +1,23 @@ +''' +@File : max_min.py +@Time : 2024/08/12 14:02:59 +@Author : Zhanpeng Yang +@Version : 0.0.1 +@Contact : zhpyang@outlook.com +@Desc : 对输入特征进行标准化放缩 +''' + + + +import numpy as np +class FeatureScaling: + def __init__(self): + self.description=f"{str(self.__class__).split(".")[-2]}" + self.flag_get_global_info=False + + + + def run( self,feature): + + feature=(feature-feature.min())/(feature.max()-feature.min()) + return feature \ No newline at end of file diff --git a/src/data/features_scaling/max_min.py b/src/data/features_scaling/global_max_min.py similarity index 100% rename from src/data/features_scaling/max_min.py rename to src/data/features_scaling/global_max_min.py diff --git a/src/data/features_scaling/standardization.py b/src/data/features_scaling/standardization.py new file mode 100644 index 0000000..73864e3 --- /dev/null +++ b/src/data/features_scaling/standardization.py @@ -0,0 +1,72 @@ +''' +@File : max_min.py +@Time : 2024/08/12 14:03:38 +@Author : Zhanpeng Yang +@Version : 0.0.1 +@Contact : zhpyang@outlook.com +@Desc : 对输出标签进行标准化放缩 +''' + + + +import numpy as np +from pathlib import Path +class FeatureScaling: + def __init__(self): + self.description=f"{str(self.__class__).split(".")[-2]}" + self.flag_get_global_info=True + self._min=None + self._max=None + self._mean=None + self._var=None + self._num=0 + + self.file_path = Path(__file__).absolute() + + + + def load_state_dict(self,state): + self._min=state["_min"] + self._max=state["_max"] + self._mean=state["_mean"] + self._var=state["_var"] + self._num=state["_num"] + def state_dict(self): + return {"_min":self._min,"_max":self._max,"_mean":self._mean,"_var":self._var,"_num":self._num} + + def get_global_info(self,feature): + feature=feature.astype(np.float32) + if self._max is None: + self._min=np.zeros(feature.shape,dtype=np.float32)+999999999 + self._max=np.zeros(feature.shape,dtype=np.float32) + self._mean=np.zeros(feature.shape,dtype=np.float32) + self._var=np.zeros(feature.shape,dtype=np.float32) + self._num=0 + self.feature_shape=feature.shape + + self._min=np.minimum(self._min,feature) + self._max=np.maximum(self._max,feature) + + if self._num>0: + self._var=((self._num-1)/(self._num))*self._var+(1/(self._num+1))*(feature-self._mean)*(feature-self._mean) + #要先更新var再更新mean + self._mean=(self._num/(self._num+1))*self._mean+(1/(self._num+1))*feature + self._num =self._num+1 + + # print("num:",self._num,"_mean:",self._mean,"_var:",self._var,"_min:",self._min,"_max:",self._max) + + + def run(self,feature): + # for i in range(label.shape[0]): + # label[i]=(label[i]-self._min[i])/(self._max[i]-self._min[i]) + feature=(feature-self._mean)/np.sqrt(self._var) + + return feature + # def reverse(self,feature): + # # for i in range(labels.shape[1]): + # # labels[:,i]=labels[:,i]*(self._max[i]-self._min[i])+self._min[i] + + # labels=labels*np.sqrt(self._var)+self._mean + # return labels + + \ No newline at end of file diff --git a/src/data/generate_raw_dataset_from_MySQL.py b/src/data/generate_raw_dataset_from_MySQL.py 
new file mode 100644 index 0000000..9d61979 --- /dev/null +++ b/src/data/generate_raw_dataset_from_MySQL.py @@ -0,0 +1,202 @@ +import os +import shutil +from pathlib import Path +import pymysql.cursors + +import numpy as np +import gzip +import msgpack + +# oldRawDatasetDir= + +def getLatestData(dataNum,deviceId='DOJHBG',measureType="TSC"): + connection = pymysql.connect(host='localhost', + user='root', + password='Ghs@2211', + database='SEMS', + cursorclass=pymysql.cursors.DictCursor) + with connection: + with connection.cursor() as cursor: + # 定义 SQL 查询 + sql = """ + SELECT + f.furnaceNumber, + f.measureStartDatetime, + f.measureEndDatetime, + f.Temperature, + f.C, + f.P, + f.S, + f.Mn, + f.Ni, + f.Mo, + f.Cr, + ot.spectralData, + ot.spectralDim, + ot.spatialDim + FROM + furnace f + JOIN + online_test ot ON f.furnaceId = ot.furnaceId + WHERE + f.deviceId = %s AND + f.measureType = %s + ORDER BY + f.furnaceID DESC + LIMIT %s; + """ + values=(deviceId,measureType,dataNum) + cursor.execute(sql,values) + result = cursor.fetchall() + return result + + + + + +def getMetadataFromMySQL(deviceId='DOJHBG',measureType="TSC"): + connection = pymysql.connect(host='localhost', + user='root', + password='Ghs@2211', + database='SEMS', + cursorclass=pymysql.cursors.DictCursor) + dataNum=0 + with connection: + with connection.cursor() as cursor: + # 定义 SQL 查询 + sql = """ + SELECT + COUNT(*) + FROM + furnace as f + WHERE + f.deviceId = %s AND + f.measureType = %s + """ + values=(deviceId,measureType) + + cursor.execute(sql,values) + result = cursor.fetchall() + dataNum=result[0]["COUNT(*)"] + + sql = """ + SELECT furnaceNumber + FROM furnace as f + WHERE + f.deviceId = %s AND + f.measureType = %s + ORDER BY furnaceId DESC + LIMIT 1; + """ + values=(deviceId,measureType) + + cursor.execute(sql,values) + result = cursor.fetchall() + lastesFurnaceNumber=result[0]["furnaceNumber"] + return dataNum,lastesFurnaceNumber + + +def saveDataset(caches, datasetDir,datasetType="train"): + os.makedirs(datasetDir/datasetType) + + for cache in caches: + print(type(cache)) + if isinstance(cache,str): + oldRawDatasetDir=Path("/data/SEMS-model-training/old_rawdata/dataset") + shutil.copy(oldRawDatasetDir/cache, datasetDir/datasetType) + if isinstance(cache,dict): + print(cache.keys()) + spectralData=msgpack.unpackb(gzip.decompress(cache["spectralData"])) + # spectralData=msgpack.unpack(gzip.decompress(spectralDatawithTime["buffer"])) + spectralDataNumpy=np.frombuffer(spectralData["buffer"],dtype=np.uint16).reshape(int(len(spectralData["buffer"])/cache["spectralDim"]/cache["spatialDim"]/2),cache["spectralDim"],cache["spatialDim"]) + + np.savez(datasetDir/datasetType/f"{cache["furnaceNumber"]}.npz",furnaceNumber=cache["furnaceNumber"],measureStartDatetime=cache["measureStartDatetime"],measureEndDatetime=cache["measureEndDatetime"],timestamps=spectralData["timestamps"],rawSpectralData=spectralDataNumpy,rawLabels=np.array([cache["Temperature"],cache["C"],cache["P"],cache["S"],cache["Mn"],cache["Ni"],cache["Mo"],cache["Cr"],]),labelNames=["Temperature","C","P","S","Mn","Ni","Mo","Cr"]) + + + + # break + + + +def generateRawDataset(datasetDir="/data/SEMS-model-training/dataset/raw_dataset",datasetNum=1000,validationRate=0.2,testRate=0.1): + + datasetDir=Path(datasetDir) + + + oldRawDatasetDir=Path("/data/SEMS-model-training/old_rawdata/dataset") + oldRawDatasetFileNames=os.listdir(oldRawDatasetDir) + oldRawDatasetFileNames.sort() + oldRawDatasetNum=len(oldRawDatasetFileNames) + # latestFurnaceID=oldRawDatasetFileNames[-1] + + 
DBdataNum,lastesFurnaceNumber=getMetadataFromMySQL(deviceId='DOJHBG',measureType="TSC") + + + + + if os.path.exists(datasetDir): + #如果存在数据集,则判断是否为最新 + if os.path.exists(datasetDir/"validation_set"): + #"通过判断文件中的最新炉次号与已存在的数据集中的一不一样 + valNames=os.listdir(datasetDir/"validation_set") + valNames.sort() + latestDatasetFurnaceID=valNames[-1] + if lastesFurnaceNumber==latestDatasetFurnaceID: + "如果数据集已经是最新的,那就直接返回,不进行任何操作" + return None + else: + shutil.rmtree(datasetDir) + else: + shutil.rmtree(datasetDir) + os.makedirs(datasetDir) + + chooseDBdataNum=0 + chooseOLDRawDataNum=0 + if (DBdataNum+oldRawDatasetNum)>datasetNum: + if DBdataNum>=datasetNum: + chooseDBdataNum=datasetNum + chooseOLDRawDataNum=0 + else: + chooseDBdataNum=DBdataNum + chooseOLDRawDataNum=datasetNum-DBdataNum + else: + chooseDBdataNum=DBdataNum + chooseOLDRawDataNum=oldRawDatasetNum + + print(f"旧数据数量:{chooseOLDRawDataNum}, 新数据数量{chooseDBdataNum}") + + if chooseDBdataNum>0: + # + DBDataset=getLatestData(chooseDBdataNum,deviceId='DOJHBG',measureType="TSC") + else: + DBDataset=[] + + rawDatasetCache=oldRawDatasetFileNames[-chooseOLDRawDataNum:]+DBDataset + print(rawDatasetCache[-1].keys()) + datasetNum=len(rawDatasetCache) + valNum=int(datasetNum*validationRate) + testNum=int(datasetNum*testRate) + trainNum=datasetNum-valNum-testNum + + saveDataset(rawDatasetCache[:trainNum], datasetDir,datasetType="training") + saveDataset(rawDatasetCache[-testNum-valNum:-testNum], datasetDir,datasetType="validation") + + saveDataset(rawDatasetCache[-testNum:], datasetDir,datasetType="test") + + + + + + + + + +if __name__ =="__main__": + tmp=generateRawDataset() + # getMetadataFromMySQL() + + # getLatestData(dataNum=1) + + + + diff --git a/src/data/labels_scaling/__pycache__/max_min.cpython-312.pyc b/src/data/labels_scaling/__pycache__/max_min.cpython-312.pyc index 100ec28..3e6a76e 100644 Binary files a/src/data/labels_scaling/__pycache__/max_min.cpython-312.pyc and b/src/data/labels_scaling/__pycache__/max_min.cpython-312.pyc differ diff --git a/src/data/labels_scaling/__pycache__/standardization.cpython-312.pyc b/src/data/labels_scaling/__pycache__/standardization.cpython-312.pyc new file mode 100644 index 0000000..98e6eb9 Binary files /dev/null and b/src/data/labels_scaling/__pycache__/standardization.cpython-312.pyc differ diff --git a/src/data/labels_scaling/max_min.py b/src/data/labels_scaling/max_min.py index 181bd4b..48be106 100644 --- a/src/data/labels_scaling/max_min.py +++ b/src/data/labels_scaling/max_min.py @@ -33,6 +33,7 @@ class LabelScaling: self._min[i]=label[i] def run(self,label): + for i in range(label.shape[0]): label[i]=(label[i]-self._min[i])/(self._max[i]-self._min[i]) return label @@ -41,5 +42,4 @@ class LabelScaling: labels[:,i]=labels[:,i]*(self._max[i]-self._min[i])+self._min[i] return labels - \ No newline at end of file diff --git a/src/data/labels_scaling/standardization.py b/src/data/labels_scaling/standardization.py new file mode 100644 index 0000000..260a005 --- /dev/null +++ b/src/data/labels_scaling/standardization.py @@ -0,0 +1,75 @@ +''' +@File : max_min.py +@Time : 2024/08/12 14:03:38 +@Author : Zhanpeng Yang +@Version : 0.0.1 +@Contact : zhpyang@outlook.com +@Desc : 对输出标签进行标准化放缩 +''' + + + +import numpy as np +from pathlib import Path +class LabelScaling: + def __init__(self): + self.description=f"{str(self.__class__).split(".")[-2]}" + self.flag_get_global_info=True + self._min=None + self._max=None + self._mean=None + self._var=None + self._num=0 + + self.file_path = Path(__file__).absolute() + + + + def 
load_state_dict(self,state): + self._min=state["_min"] + self._max=state["_max"] + self._mean=state["_mean"] + self._var=state["_var"] + self._num=state["_num"] + def state_dict(self): + return {"_min":self._min,"_max":self._max,"_mean":self._mean,"_var":self._var,"_num":self._num} + + + def get_global_info(self,label): + label=label.astype(np.float32) + label_dim=label.shape[0] + if self._max is None: + self._min=np.zeros((label_dim),dtype=np.float32)+999999999 + self._max=np.zeros((label_dim),dtype=np.float32) + self._mean=np.zeros((label_dim),dtype=np.float32) + self._var=np.zeros((label_dim),dtype=np.float32) + self._num=0 + + self._min=np.minimum(self._min,label) + self._max=np.maximum(self._max,label) + + if self._num>0: + self._var=((self._num-1)/(self._num))*self._var+(1/(self._num+1))*(label-self._mean)*(label-self._mean) + #要先更新var再更新mean + self._mean=(self._num/(self._num+1))*self._mean+(1/(self._num+1))*label + self._num =self._num+1 + + # print("num:",self._num,"_mean:",self._mean,"_var:",self._var,"_min:",self._min,"_max:",self._max) + # print("_var:",self._var) + + + + def run(self,label): + # for i in range(label.shape[0]): + # label[i]=(label[i]-self._min[i])/(self._max[i]-self._min[i]) + label=(label-self._mean)/np.sqrt(self._var) + + return label + def reverse(self,labels): + # for i in range(labels.shape[1]): + # labels[:,i]=labels[:,i]*(self._max[i]-self._min[i])+self._min[i] + + labels=labels*np.sqrt(self._var)+self._mean + return labels + + \ No newline at end of file diff --git a/src/data/pre_process.py b/src/data/pre_process.py index 28287bf..95fea00 100644 --- a/src/data/pre_process.py +++ b/src/data/pre_process.py @@ -14,6 +14,8 @@ import numpy as np import datetime import pickle import h5py +from pathlib import Path +from tqdm import tqdm class DataPreProcess: """ @@ -23,274 +25,85 @@ class DataPreProcess: 3. 将feature 与 Label进行放缩 """ - def __init__(self, raw_spectral_data_dir, raw_labels_dir,labels_name,dataset_dir,choose_frame_spatial, features_scaling,labels_scaling,train_ratio=0.8,validate_ratio=0.1) -> None: + def __init__(self, dataset_dir,choose_frame_spatial, features_scaling,labels_scaling) -> None: """初始化类,传入所有数据预处理需要的参数 - - Args: - raw_spectral_data_dir (_type_):原始光谱数据路径 - raw_labels_dir (_type_): 原始标签路径 - labels_name (_type_): 提取出的标签名 - dataset_dir (_type_): 生成的数据集文件夹 - choose_frame_spatial (_type_): 类对象,进行光谱特征挑选 - features_scaling (_type_): 类对象,对输入的特征进行放缩,使之直接输入神经网络 - labels_scaling (_type_): 类对象,对输出的的标签进行放缩 - train_ratio (float, optional): 训练集比例. Defaults to 0.8. - validate_ratio (float, optional): 验证集比例,剩余的即为测试集. Defaults to 0.1. 
""" - self.raw_spectral_data_dir=raw_spectral_data_dir - self.raw_labels_dir=raw_labels_dir - self.dataset_dir=dataset_dir - self.labels_name=labels_name + + self.dataset_dir=Path(dataset_dir) + self.choose_frame_spatial=choose_frame_spatial self.features_scaling=features_scaling self.labels_scaling=labels_scaling - self.train_ratio=train_ratio - self.validate_ratio=validate_ratio - - #加载原始的标签 - self.raw_labels=self._load_raw_labels(raw_labels_dir,labels_name) - - #加载原始光谱的缓存 - self.raw_spectral_data_cache=self._load_raw_spectral_data_cache(raw_spectral_data_dir) - - #随便加载一个csv文件,判断光谱数据的维度 - self.spectral_dim,self.spatial_dim= self._get_spectral_spatial_dim(self.raw_spectral_data_cache) - - #正式开始原始数据转化为数据集 - self._raw_data_2_dataset() - - - - def _load_raw_labels(self,raw_labels_dir:str,labels_name:list)->pd.DataFrame: - """读取利用脚本处理后的钢厂给的excel文件所在文件夹(会扫描所有文件) - 并选择指定的label名作为output - 并去除值为NaN或0的行 - - Args: - raw_labels_dir (str): 利用脚本处理后的钢厂给的excel路径 - labels_name (list): 指定的作为Label的列 - - Returns: - pd.DataFrame: 返回所有筛选后的炉次数据 - """ - - raw_labels=None - for name in os.listdir(raw_labels_dir): - tmp_raw_labels=pd.read_excel(os.path.join(raw_labels_dir,name)) - - choosed_column=["TSC_start_time","TSC_end_time"] - choosed_column=choosed_column+labels_name + self.labelNames=None + # _raw_data_2_dataset(self) + self.run() - #只选出我们想要的部分作为标签 - tmp_raw_labels=tmp_raw_labels.loc[:,choosed_column] + def _raw_data_2_pre_dataset(self,pre_dataset_save_dir,dataset_type="test"): - # 选出有NULL的行 - null_rows=tmp_raw_labels.isnull().any(axis=1) - # # 选出有0的行 - zeros_rows=(tmp_raw_labels==0).any(axis=1) | (tmp_raw_labels=='0').any(axis=1) + rawdata_names= os.listdir(self.dataset_dir/"raw_dataset"/dataset_type) + if not os.path.exists(pre_dataset_save_dir/dataset_type): + os.makedirs(pre_dataset_save_dir/dataset_type) + print(f"[预处理]: {dataset_type}数据集选择特征时间与空间点 ") + for name in tqdm(rawdata_names): + + tmp_data=np.load(self.dataset_dir/"raw_dataset"/dataset_type/name,allow_pickle=True) - # # 每一行但凡有NULL或者0都给删了 - selected_rows=~(null_rows|zeros_rows) - tmp_raw_labels=tmp_raw_labels[selected_rows] + raw_spectral_data=self.choose_frame_spatial.run(tmp_data["measureStartDatetime"],tmp_data["measureEndDatetime"],tmp_data["timestamps"],tmp_data["rawSpectralData"]) - if raw_labels is None: - raw_labels=tmp_raw_labels - else: - raw_labels=pd.concat([raw_labels,tmp_raw_labels],axis=0) - logging.debug(f"Reading raw label excel file:{name}, which has {tmp_raw_labels.shape[0]} furnaces") - logging.debug(f"Readed raw label excel files, which has {raw_labels.shape[0]} furnaces in total") - - return raw_labels - def _load_raw_spectral_data_cache(self,raw_spectral_data_dir:str)->list: - """生成所有原始光谱数据文件的缓存,包括每个文件记录的开始及结束时间,目的为加快后面读取原始数据的速度 + flag_data_augmentation=False + ################################ + #此段代码是为数据增强,raw_spectral_data是二维数据,代表多个光谱曲线,使每个光谱曲线都对应相同的label。 - Args: - raw_spectral_data_dir (str): 原始光谱所在路径 + if flag_data_augmentation: - Returns: - list: 缓存,其中有多少个成员,就有多少个原始数据文件,每个成员包括格式为datetime的开始及结束时间,及文件路径 - """ - spectral_file_paths=glob.glob(os.path.join(raw_spectral_data_dir,"*.csv")) - cache_file_paths=glob.glob(os.path.join(raw_spectral_data_dir,"*.pkl")) + if raw_spectral_data is not None: - update_flag=False + for i in range(raw_spectral_data.shape[0]): + 
np.savez(pre_dataset_save_dir/dataset_type/f"{name[:-4]}_{i}.npz",furnaceNumber=tmp_data["furnaceNumber"],measureStartDatetime=tmp_data["measureStartDatetime"],measureEndDatetime=tmp_data["measureEndDatetime"],timestamps=tmp_data["timestamps"],rawSpectralData=raw_spectral_data[i,...],rawLabels=tmp_data["rawLabels"],labelNames=tmp_data["labelNames"]) - #不存在缓存文件,以及缓存文件中数据文件个数与文件夹中的文件个数不一致,均重新生成 - if len(cache_file_paths)==0: - logging.debug(f"Raw spectral data cache is not existed! Generating") - update_flag=True - elif len(cache_file_paths)==1: - with open(cache_file_paths[0],"rb") as f: - raw_spectral_data_cache=pickle.load(f) - if len(raw_spectral_data_cache) !=len(spectral_file_paths): - logging.debug(f"Raw spectral data cache is out of date! Regenerating, cache file number:{len(raw_spectral_data_cache)}, spectral data file number: {len(spectral_file_paths)}") - update_flag=True - else: - logging.error(f"More the one 'raw_spectral_data_cache.pkl' file is existed in {raw_spectral_data_dir}") - if update_flag: - spectral_file_paths.sort() - raw_spectral_data_cache=[] - for file in spectral_file_paths: - tmp_info={} - tmp_data=np.loadtxt(file, delimiter=",") - start_t=datetime.datetime.fromtimestamp(tmp_data[0,0]/1000)+datetime.timedelta(microseconds=tmp_data[0,0]%1000) - end_t=datetime.datetime.fromtimestamp(tmp_data[-1,0]/1000)+datetime.timedelta(microseconds=tmp_data[-1,0]%1000) - tmp_info["start_t"]=start_t - tmp_info["end_t"]=end_t - tmp_info["file_path"]=file - raw_spectral_data_cache.append(tmp_info) - with open(os.path.join(raw_spectral_data_dir,f"raw_spectral_data_cache.pkl"),"wb") as f: - pickle.dump(raw_spectral_data_cache,f) - return raw_spectral_data_cache - def _get_spectral_spatial_dim(self,raw_spectral_data_cache): - data=np.loadtxt(raw_spectral_data_cache[0]["file_path"], delimiter=",").astype(np.uint64) - if data[0,2]==229376: - spectral_dim =224 - spatial_dim=512 - if data[0,2]==917504: - spectral_dim =448 - spatial_dim=1024 - return spectral_dim, spatial_dim - - def _read_spectral_data(self,start_time:datetime.datetime,end_time:datetime.datetime)->np.ndarray: - """获取从start_time到end_time的光谱数据 - Args: - start_time (datetime.datetime): 开始时间 - end_time (datetime.datetime): 结束时间 - - Returns: - np.ndarray: 原始光谱数据 - """ - - - def get_spectral_data_per_file(file_path,s_t,e_t): - data=np.loadtxt(file_path, delimiter=",").astype(np.uint64) - - if s_t is not None: - tmp_s=datetime.datetime.timestamp(s_t)*1000 - tmp_s_index=0 - for i in range(data.shape[0]-1): - if data[i,0]<=tmp_s and data[i+1,0]>=tmp_s: - tmp_s_index=i - break - else: - tmp_s_index=0 - if e_t is not None: - tmp_e=datetime.datetime.timestamp(e_t)*1000 - tmp_e_index=data.shape[0] - for i in range(tmp_s_index,data.shape[0]-1): - if data[i,0]<=tmp_e and data[i+1,0]>=tmp_e: - tmp_e_index=i - break - else: - tmp_e_index=data.shape[0] - - with open(file_path[:-3]+"bin", "rb") as f: - f.seek(data[tmp_s_index,1]) - d=f.read(np.uint64((tmp_e_index-tmp_s_index)*data[tmp_s_index,2])) - d=np.frombuffer(d, dtype=np.uint16).reshape(tmp_e_index-tmp_s_index,self.spectral_dim,self.spatial_dim) - return data[tmp_s_index:tmp_e_index,0],d - - timestamps=None - raw_spectral_data=None - for tmp_info in self.raw_spectral_data_cache: - tmp_data=None - if start_timetmp_info["end_t"] and end_timetmp_info["end_t"]: - # 目标时间段完全包含此文件时间段。所以取从文件开始到结束的数据 - tmp_time_stamp,tmp_data=get_spectral_data_per_file(tmp_info["file_path"] ,None,None) - elif start_time>tmp_info["start_t"] and end_timetmp_info["start_t"] and start_timetmp_info["end_t"]: - # 
目标时间段在交于此文件时间段的右侧。所以取从文件start_time到文件结束的数据 - tmp_time_stamp,tmp_data=get_spectral_data_per_file(tmp_info["file_path"] ,start_time,None) - if tmp_data is not None: - if raw_spectral_data is None: - timestamps=tmp_time_stamp - raw_spectral_data=tmp_data + ############################### else: - timestamps=np.concatenate((timestamps,tmp_time_stamp),axis=0) - raw_spectral_data=np.concatenate((raw_spectral_data,tmp_data),axis=0) - return timestamps,raw_spectral_data - - def _raw_data_2_dataset(self): + if raw_spectral_data is not None: + np.savez(pre_dataset_save_dir/dataset_type/name,furnaceNumber=tmp_data["furnaceNumber"],measureStartDatetime=tmp_data["measureStartDatetime"],measureEndDatetime=tmp_data["measureEndDatetime"],timestamps=tmp_data["timestamps"],rawSpectralData=raw_spectral_data,rawLabels=tmp_data["rawLabels"],labelNames=tmp_data["labelNames"]) - save_dir=os.path.join(self.dataset_dir,self.choose_frame_spatial.description) + ##################### - #第一步,进行特征时间与空间点选取,并保存 + def _pre_dataset_2_dataset(self,pre_dataset_save_dir,dataset_save_dir,dataset_type="test",savaFlag=True): + + print(f"[预处理]: {dataset_type}数据集进行数据预处理") + pre_dataset_files=os.listdir(pre_dataset_save_dir/dataset_type) + - pre_dataset_save_dir=os.path.join(save_dir,"data") - if not os.path.exists(pre_dataset_save_dir): - os.makedirs(pre_dataset_save_dir) - - #数据路径不存在就重新生成,如果存在,就跳过这部分。 - # !!!!!!!!!!因此需要额外注意,如果有新数据,需要把dataset下的文件夹都删除,相当于删除缓存,重新生成 - for i in range(self.raw_labels.shape[0]): - - start_time,end_time=self.choose_frame_spatial.get_data_interval(self.raw_labels.iloc[i]["TSC_start_time"],self.raw_labels.iloc[i]["TSC_end_time"]) - timestamps,raw_spectral_data=self._read_spectral_data(start_time,end_time) - - if raw_spectral_data is not None: - #获取到的数据帧率大于2才开始记录 - if raw_spectral_data.shape[0]>2*(end_time-start_time).total_seconds(): - logging.debug(f"PreProcess Stage 1: [{i+1}/{self.raw_labels.shape[0]}] with {timestamps.shape[0]} frames") - raw_spectral_data=self.choose_frame_spatial.run(timestamps,raw_spectral_data) - np.savez(os.path.join(pre_dataset_save_dir,f"{timestamps[0]}_{timestamps.shape[0]}.npz",),timestamps=timestamps,raw_spectral_data=raw_spectral_data,raw_labels=self.raw_labels.iloc[i][self.labels_name].to_numpy()) - else: - logging.info(f"Pre Dataset is existed in {pre_dataset_save_dir}") - - #第二步,进行标准化,并形成数据集 - - self.dataset_save_dir=os.path.join(save_dir,f"{self.features_scaling.description}_{self.labels_scaling.description}") - - if not os.path.exists(self.dataset_save_dir): - os.makedirs(self.dataset_save_dir) - pre_dataset_files=os.listdir(pre_dataset_save_dir) - - if self.features_scaling.flag_get_global_info: + if dataset_type=="training" and (self.features_scaling.flag_get_global_info or self.labels_scaling.flag_get_global_info): for name in pre_dataset_files: - tmp_data=np.load(os.path.join(pre_dataset_save_dir,name),allow_pickle=True) - self.features_scaling.get_global_info(tmp_data["raw_spectral_data"]) + tmp_data=np.load(os.path.join(pre_dataset_save_dir,dataset_type,name),allow_pickle=True) + if self.labelNames is None: + self.labelNames=tmp_data["labelNames"] + if self.features_scaling.flag_get_global_info: + self.features_scaling.get_global_info(tmp_data["rawSpectralData"]) if self.labels_scaling.flag_get_global_info: - self.labels_scaling.get_global_info(tmp_data["raw_labels"]) - - # 获取维度信息 - tmp_data=np.load(os.path.join(pre_dataset_save_dir,name),allow_pickle=True) - feature_dim=tmp_data["raw_spectral_data"].shape - label_dim=tmp_data["raw_labels"].shape + 
self.labels_scaling.get_global_info(tmp_data["rawLabels"]) + np.set_printoptions(suppress = True) + # print(f"[预处理]Label全局信息:_num",self.labels_scaling._num,"_mean:",self.labels_scaling._mean,"_var:",self.labels_scaling._var,"_min:",self.labels_scaling._min,"_max:",self.labels_scaling._max) + + if savaFlag: + + tmp_data=np.load(pre_dataset_save_dir/dataset_type/pre_dataset_files[0],allow_pickle=True) + feature_dim=tmp_data["rawSpectralData"].shape + label_dim=tmp_data["rawLabels"].shape - #划分训练集_验证集_测试集, - np.random.shuffle(pre_dataset_files) - [train_dateset_files,validate_dateset_files]=np.split(pre_dataset_files,[int(len(pre_dataset_files)*self.train_ratio)]) - [validate_dateset_files,test_dateset_files]=np.split(validate_dateset_files,[int(len(pre_dataset_files)*self.validate_ratio)]) - - - #写入HDF5文件 - for dataset_name in ["train","validate","test"]: - if dataset_name=="train": - file_names=train_dateset_files - elif dataset_name=="validate": - file_names=validate_dateset_files - elif dataset_name=="test": - file_names=test_dateset_files - - logging.info(f"Generating {dataset_name} dataset with {len(file_names)} samples") - with h5py.File(os.path.join( self.dataset_save_dir,f"{dataset_name}.h5"), 'w') as f: + with h5py.File(os.path.join( dataset_save_dir,f"{dataset_type}.h5"), 'w') as f: h5_features = f.create_dataset( 'features', - tuple([len(file_names)]+list(feature_dim)), + tuple([len(pre_dataset_files)]+list(feature_dim)), maxshape=(tuple([None]+list(feature_dim))), chunks=tuple([1]+list(feature_dim)), # 手动设置 chunk 大小为一次数据的大小tuple([1]+list(feature_dim)) # compression='gzip', @@ -299,52 +112,104 @@ class DataPreProcess: ) h5_labels = f.create_dataset( 'labels', - tuple([len(file_names)]+list(label_dim)), + tuple([len(pre_dataset_files)]+list(label_dim)), chunks=tuple([1]+list(label_dim)), # 手动设置 chunk 大小为一次数据的大小 dtype=np.float32, ) - for i,name in enumerate(file_names): - tmp_data=np.load(os.path.join(pre_dataset_save_dir,name),allow_pickle=True) + for i,name in enumerate(pre_dataset_files): + tmp_data=np.load(os.path.join(pre_dataset_save_dir,dataset_type,name),allow_pickle=True) - feature=self.features_scaling.run(tmp_data["raw_spectral_data"]) - label=self.labels_scaling.run(tmp_data["raw_labels"]) + feature=self.features_scaling.run(tmp_data["rawSpectralData"]) + label=self.labels_scaling.run(tmp_data["rawLabels"]) h5_features[i]=feature.astype(np.float32) h5_labels[i]=label.astype(np.float32) - else: - pre_dataset_files=os.listdir(pre_dataset_save_dir) + - if self.labels_scaling.flag_get_global_info: - for name in pre_dataset_files: - tmp_data=np.load(os.path.join(pre_dataset_save_dir,name),allow_pickle=True) - self.labels_scaling.get_global_info(tmp_data["raw_labels"]) + def run(self): + + save_dir=self.dataset_dir/self.choose_frame_spatial.description + + #第一步,进行特征时间与空间点选取,并保存 + + pre_dataset_save_dir=save_dir/"pre_dataset" + + if not os.path.exists(pre_dataset_save_dir): + os.makedirs(pre_dataset_save_dir) + + #数据路径不存在就重新生成,如果存在,就跳过这部分。 + # !!!!!!!!!!因此需要额外注意,如果有新数据,需要把dataset下的文件夹都删除,相当于删除缓存,重新生成 + self._raw_data_2_pre_dataset(pre_dataset_save_dir,dataset_type="training") + self._raw_data_2_pre_dataset(pre_dataset_save_dir,dataset_type="validation") + self._raw_data_2_pre_dataset(pre_dataset_save_dir,dataset_type="test") + + else: + logging.info(f"Pre Dataset is existed in {pre_dataset_save_dir}") + + #第二步,进行标准化,并形成数据集 + + self.dataset_save_dir=save_dir / f"{self.features_scaling.description}_{self.labels_scaling.description}" + + if not 
os.path.exists(self.dataset_save_dir): + os.makedirs(self.dataset_save_dir) + self._pre_dataset_2_dataset( pre_dataset_save_dir,self.dataset_save_dir,dataset_type="training",savaFlag=True) + self._pre_dataset_2_dataset( pre_dataset_save_dir,self.dataset_save_dir,dataset_type="validation",savaFlag=True) + self._pre_dataset_2_dataset( pre_dataset_save_dir,self.dataset_save_dir,dataset_type="test",savaFlag=True) + + + else: + + self._pre_dataset_2_dataset( pre_dataset_save_dir,self.dataset_save_dir,dataset_type="training",savaFlag=False) + self._pre_dataset_2_dataset( pre_dataset_save_dir,self.dataset_save_dir,dataset_type="validation",savaFlag=False) + self._pre_dataset_2_dataset( pre_dataset_save_dir,self.dataset_save_dir,dataset_type="test",savaFlag=False) logging.info(f"Standardized Dataset is existed in {self.dataset_save_dir}") def get_metric(self,outputs,labels): + # print("outputs Before",outputs) outputs=self.labels_scaling.reverse(outputs) + # print("outputs After",outputs) + # print("labels Before",labels) labels=self.labels_scaling.reverse(labels) + # print("labels After",labels) error=outputs-labels - # print("outputs",outputs) - # print("labels",labels) - # print("errors",error) - hit_rate=np.zeros(error.shape[1]) - for i,name in enumerate(self.labels_name): - # error[:,i]=outputs[:,i]-labels[:,i] + if len(error.shape)==1: + hit_rate=np.zeros((1)) + else: + hit_rate=np.zeros(error.shape[1]) - #["TSC_T","TSC_C","TSC_C_lab","TSC_P_lab"] + bounds=np.zeros(error.shape) - if name =="TSC_T": - bound=10 - elif name=="TSC_C": - bound=0.05 - elif name=="TSC_C_lab": - bound=0.05 - elif name=="TSC_P_lab": - bound=0.0005 + # ["Temperature","C","P","S","Mn","Ni","Mo","Cr"] + + + for i,name in enumerate(self.labelNames): + if name =="Temperature": + bounds[:,i]=10.0+np.zeros(labels[:,i].shape) + elif name=="C": + bounds[:,i]=0.05+np.zeros(labels[:,i].shape) + elif name=="P": + bounds[:,i]=0.005+np.zeros(labels[:,i].shape) + elif name=="S": + bounds[:,i]=0.01+np.zeros(labels[:,i].shape) + elif name=="Mn": + bounds[:,i]=0.05+np.zeros(labels[:,i].shape) + elif name=="Ni": + bounds[:,i]=0.25*labels[:,i] + elif name=="Mo": + bounds[:,i]=0.25*labels[:,i] + elif name=="Cr": + bounds[:,i]=0.25*labels[:,i] + + # if isinstance(bound, float): + + # print(f"{name}outputs:{outputs[0,i]}labels:{labels[0,i]}error:{error[0,i]}bound:{bound}") + # else: + # print(f"{name}outputs:{outputs[0,i]}labels:{labels[0,i]}error:{error[0,i]}bound:{bound[0]}") - hit_rate[i]=((error[:,i]>=-bound) &(error[:,i]<=bound)).sum() /error.shape[0] - return error,hit_rate + hit_rate[i]=((error[:,i]>=-bounds[:,i]) &(error[:,i]<=bounds[:,i])).sum() /error.shape[0] + + return outputs, labels,error,hit_rate,bounds @@ -355,19 +220,13 @@ if __name__=="__main__": logging.basicConfig(level = logging.DEBUG) - raw_data_dir="/code/admin/20240806-NanEr-5-8-data/rawdata" - labels_path="/code/admin/20240806-NanEr-5-8-data/labels/NanEr" - dataset_dir="/code/admin/20240806-NanEr-5-8-data/dataset" - labels_name=["TSC_T","TSC_C","TSC_C_lab","TSC_P_lab"] + datasetDir="/data/SEMS-model-training/dataset" - from choose_frame_spatial.mean import ChooseFrameSpatial - from features_scaling.max_min import FeatureScaling - from labels_scaling.max_min import LabelScaling - choose_frame_spatial=ChooseFrameSpatial(interval=[-30,30]) + from choose_frame_spatial.mean_CARS100_3 import ChooseFrameSpatial + from features_scaling.standardization import FeatureScaling + from labels_scaling.standardization import LabelScaling + choose_frame_spatial=ChooseFrameSpatial() 
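+    # Both standardization objects appear to implement plain z-scoring (an assumption, but one
+    # consistent with the round trip in src/data/test.ipynb further down): run(x) = (x - _mean) / sqrt(_var)
+    # and reverse(y) = y * sqrt(_var) + _mean, e.g. (1612.53 - 1591.53) / sqrt(786.90) ≈ 0.7486.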
features_scaling=FeatureScaling() labels_scaling=LabelScaling() - - - - data_pre_process=DataPreProcess(raw_data_dir,labels_path,labels_name,dataset_dir,choose_frame_spatial,features_scaling,labels_scaling) \ No newline at end of file + data_pre_process=DataPreProcess(datasetDir,choose_frame_spatial,features_scaling,labels_scaling) \ No newline at end of file diff --git a/src/data/spectral_dataset.py b/src/data/spectral_dataset.py index 24f26ab..5834748 100644 --- a/src/data/spectral_dataset.py +++ b/src/data/spectral_dataset.py @@ -4,10 +4,10 @@ import os import time -def load_dataset(dataset_dir): - train_dataset=SpectralDataset(os.path.join(dataset_dir,"train.h5")) - val_dataset=SpectralDataset(os.path.join(dataset_dir,"validate.h5")) - return train_dataset, val_dataset +def load_dataset(dataset_dir,dataset_type="training"): + dataset=SpectralDataset(os.path.join(dataset_dir,f"{dataset_type}.h5")) + # val_dataset=SpectralDataset(os.path.join(dataset_dir,"validation.h5")) + return dataset class SpectralDataset(torch.utils.data.Dataset): def __init__(self, hdf5_path): @@ -17,8 +17,8 @@ class SpectralDataset(torch.utils.data.Dataset): # rdcc_w0=0, # 缓存淘汰策略。值越接近 0,优先淘汰最近最少使用的 chunk。值越接近 1,优先淘汰已完全读取或写入的块 # rdcc_nslots=1e8, # 定位缓存 chunk 位置的坐标长度。推荐为缓存 chunk 数的 10 倍,为达最佳性能需要 100 倍。默认为 521 ) - self.features=self.hdf5_f["features"] - self.labels=self.hdf5_f["labels"] + self.features=self.hdf5_f["features"][...] + self.labels=self.hdf5_f["labels"][...] def __len__(self): return self.features.shape[0] @@ -26,9 +26,14 @@ class SpectralDataset(torch.utils.data.Dataset): #构造Input + feature=self.features[idx] label=self.labels[idx] + + + + return feature, label @@ -37,7 +42,7 @@ class SpectralDataset(torch.utils.data.Dataset): if __name__ =="__main__": - training_data=SpectralDataset("/home/admin/20240806-NanEr-5-8-data/dataset/mean_-30_30/max_min_max_min/train.h5") + training_data=SpectralDataset("/data/SEMS-model-training/dataset/ChooseFrameSpatial'>_-30_30/max_min_max_min/training.h5") print("数据集大小为:",len(training_data)) print("输入Feature维度为:", training_data[0][0].shape, "输出Label维度为:",training_data[0][1].shape) diff --git a/src/data/test.ipynb b/src/data/test.ipynb new file mode 100644 index 0000000..b72a217 --- /dev/null +++ b/src/data/test.ipynb @@ -0,0 +1,126 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[预处理]: training数据集进行数据预处理\n", + "[预处理]Label全局信息:_num 549 _mean: [1591.5325 0.39140984 0.04271309 0.02187012 0.14065096\n", + " 0.02857196 0.00202732 0.10250819] _var: [786.9018 0.01670736 0.00040787 0.0000213 0.00179902\n", + " 0.00326453 0.00004455 0.00067495] _min: [1480. 0.079 0.0005 0.0003 0.0005 0.001 0.001\n", + " 0.001 ] _max: [1666. 0.829 0.0957 0.044 0.7264 0.823 0.152\n", + " 0.522 ]\n", + "[预处理]: validation数据集进行数据预处理\n", + "[预处理]Label全局信息:_num 549 _mean: [1591.5325 0.39140984 0.04271309 0.02187012 0.14065096\n", + " 0.02857196 0.00202732 0.10250819] _var: [786.9018 0.01670736 0.00040787 0.0000213 0.00179902\n", + " 0.00326453 0.00004455 0.00067495] _min: [1480. 0.079 0.0005 0.0003 0.0005 0.001 0.001\n", + " 0.001 ] _max: [1666. 0.829 0.0957 0.044 0.7264 0.823 0.152\n", + " 0.522 ]\n", + "[预处理]: test数据集进行数据预处理\n", + "[预处理]Label全局信息:_num 549 _mean: [1591.5325 0.39140984 0.04271309 0.02187012 0.14065096\n", + " 0.02857196 0.00202732 0.10250819] _var: [786.9018 0.01670736 0.00040787 0.0000213 0.00179902\n", + " 0.00326453 0.00004455 0.00067495] _min: [1480. 
0.079 0.0005 0.0003 0.0005 0.001 0.001\n", + " 0.001 ] _max: [1666. 0.829 0.0957 0.044 0.7264 0.823 0.152\n", + " 0.522 ]\n" + ] + } + ], + "source": [ + "from pre_process import DataPreProcess\n", + "\n", + "\n", + "datasetDir=\"/data/SEMS-model-training/dataset\"\n", + "\n", + "\n", + "from choose_frame_spatial.DBSCAN import ChooseFrameSpatial\n", + "from features_scaling.standardization import FeatureScaling\n", + "from labels_scaling.standardization import LabelScaling\n", + "choose_frame_spatial=ChooseFrameSpatial()\n", + "features_scaling=FeatureScaling()\n", + "labels_scaling=LabelScaling()\n", + "data_pre_process=DataPreProcess(datasetDir,choose_frame_spatial,features_scaling,labels_scaling)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([1591.5325 , 0.39140984, 0.04271309, 0.02187012,\n", + " 0.14065096, 0.02857196, 0.00202732, 0.10250819],\n", + " dtype=float32)" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "labels_scaling._mean" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Before: [1612.5325, 0.450984, 0.067271309, 0.05687012, 0.2365096, 0.06357196, 0.0034502732, 0.450250819]\n", + "Scaled: [0.7486169269676385, 0.46089706984319634, 1.2160017876067477, 7.584260858761072, 2.26002739629723, 0.612572700572906, 0.21319305953525988, 13.385111606844243]\n", + "Reversed\t: [1612.5325, 0.450984, 0.067271309, 0.05687012, 0.2365096, 0.06357196, 0.0034502732, 0.450250819]\n", + "Before \t: [1612.5325, 0.450984, 0.067271309, 0.05687012, 0.2365096, 0.06357196, 0.0034502732, 0.450250819]\n" + ] + } + ], + "source": [ + "labels= [1612.5325 , 0.450984, 0.067271309 , 0.05687012 , 0.2365096, 0.06357196 , 0.0034502732 , 0.450250819]\n", + "print(f\"Before: {labels}\")\n", + "tmp=labels_scaling.run(labels)\n", + "print(f\"Scaled: {list(tmp)}\")\n", + "labels_r=labels_scaling.reverse(tmp)\n", + "print(f\"Reversed\\t: {list(labels_r)}\")\n", + "print(f\"Before \\t: {labels}\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "dl", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/src/main_train.py b/src/main_train.py index 886ca7a..85a6ccc 100644 --- a/src/main_train.py +++ b/src/main_train.py @@ -5,6 +5,10 @@ import ray,os import ray.tune from functools import partial +import shutil + +import subprocess + @@ -24,6 +28,16 @@ def main(hydra_cfg : omegaconf.DictConfig) -> None: hydra_output_dir=hydra_output_dir['runtime']['output_dir'] + command = f""" + sudo kill -9 $(sudo lsof -t -i :6006); + tensorboard --logdir {os.path.join(hydra_output_dir,"ray-tune")} --host 0.0.0.0 --port 6006 + """ + + + # 创建子进程 + tensorboard_subprocess = subprocess.Popen(command, shell=True, executable='/bin/bash',start_new_session=True) + + #获取当前超参数配置 tune_cfg={} for key in hydra_cfg.ray_tune.config.keys(): @@ -44,10 +58,24 @@ def main(hydra_cfg : omegaconf.DictConfig) -> None: ) #保存最好的模型 + best_trial = 
result.get_best_trial("val_loss", "min", "last") + + + + print(f"Best trial config: {best_trial.config}") print(f"Best trial final validation loss: {best_trial.last_result['val_loss']}") + print(f"Best trial checkpoint path: {best_trial.checkpoint.path}") + + shutil.copytree(best_trial.checkpoint.path, os.path.join(hydra_output_dir,"best-model")) + + # tensorboard_subprocess.terminate() + # tensorboard_subprocess.wait() + # print("TensorBoard terminated") + + diff --git a/src/model/DRSN-CW.py b/src/model/DRSN-CW.py new file mode 100644 index 0000000..936c24a --- /dev/null +++ b/src/model/DRSN-CW.py @@ -0,0 +1,160 @@ +import torch +import torch.nn as nn + + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, in_channels, out_channels, stride=1): + super().__init__() + self.shrinkage = Shrinkage(out_channels, gap_size=(1)) + # residual function + self.residual_function = nn.Sequential( + nn.Conv1d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False), + nn.BatchNorm1d(out_channels), + nn.ReLU(inplace=True), + nn.Conv1d(out_channels, out_channels * BasicBlock.expansion, kernel_size=3, padding=1, bias=False), + nn.BatchNorm1d(out_channels * BasicBlock.expansion), + self.shrinkage + ) + # shortcut + self.shortcut = nn.Sequential() + + # the shortcut output dimension is not the same with residual function + # use 1*1 convolution to match the dimension + if stride != 1 or in_channels != BasicBlock.expansion * out_channels: + self.shortcut = nn.Sequential( + nn.Conv1d(in_channels, out_channels * BasicBlock.expansion, kernel_size=1, stride=stride, bias=False), + nn.BatchNorm1d(out_channels * BasicBlock.expansion) + ) + + def forward(self, x): + + return nn.ReLU(inplace=True)(self.residual_function(x) + self.shortcut(x)) + # a = self.residual_function(x), + # b = self.shortcut(x), + # c = a+b + # return c + + +class Shrinkage(nn.Module): + def __init__(self, channel, gap_size): + super(Shrinkage, self).__init__() + self.gap = nn.AdaptiveAvgPool1d(gap_size) + self.fc = nn.Sequential( + nn.Linear(channel, channel), + nn.ReLU(inplace=True), + nn.Linear(channel, channel), + nn.Sigmoid(), + ) + + def forward(self, x): + x_raw = x + x = torch.abs(x) + x_abs = x + x = self.gap(x) + x = torch.flatten(x, 1) + # average = torch.mean(x, dim=1, keepdim=True) #CS + average = x #CW + x = self.fc(x) + x = torch.mul(average, x) + x = x.unsqueeze(2) + # soft thresholding + sub = x_abs - x + zeros = sub - sub + n_sub = torch.max(sub, zeros) + x = torch.mul(torch.sign(x_raw), n_sub) + return x + + +class SpectralModel(nn.Module): + + def __init__(self, block=BasicBlock, num_block=[3, 4, 6, 3], num_classes=8): + super().__init__() + + self.in_channels = 64 + + self.conv1 = nn.Sequential( + nn.Conv1d(1, 64, kernel_size=3, padding=1, bias=False), + nn.BatchNorm1d(64), + nn.ReLU(inplace=True)) + # we use a different inputsize than the original paper + # so conv2_x's stride is 1 + self.conv2_x = self._make_layer(block, 64, num_block[0], 1) + self.conv3_x = self._make_layer(block, 128, num_block[1], 2) + self.conv4_x = self._make_layer(block, 256, num_block[2], 2) + self.conv5_x = self._make_layer(block, 512, num_block[3], 2) + self.avg_pool = nn.AdaptiveAvgPool1d((1)) + # self.fc = nn.Linear(512 * block.expansion, 1024) + # self.fc = nn.Linear(1024 , 512) + # self.fc = nn.Linear(512, 512) + self.fc = nn.Sequential( + nn.BatchNorm1d(512), + nn.Linear(512 * block.expansion, 1024), + nn.ReLU(inplace=True),nn.BatchNorm1d(1024), + nn.Linear(1024 , 512), + 
nn.ReLU(inplace=True),nn.BatchNorm1d(512), + nn.Linear(512, 128), + nn.ReLU(inplace=True),nn.BatchNorm1d(128), + nn.Linear(128, 8),) + + def _make_layer(self, block, out_channels, num_blocks, stride): + """make rsnet layers(by layer i didnt mean this 'layer' was the + same as a neuron netowork layer, ex. conv layer), one layer may + contain more than one residual shrinkage block + + Args: + block: block type, basic block or bottle neck block + out_channels: output depth channel number of this layer + num_blocks: how many blocks per layer + stride: the stride of the first block of this layer + + Return: + return a rsnet layer + """ + + # we have num_block blocks per layer, the first block + # could be 1 or 2, other blocks would always be 1 + strides = [stride] + [1] * (num_blocks - 1) + layers = [] + for stride in strides: + layers.append(block(self.in_channels, out_channels, stride)) + self.in_channels = out_channels * block.expansion + + return nn.Sequential(*layers) + + def forward(self, x): + x=torch.unsqueeze(x,1) + output = self.conv1(x) + output = self.conv2_x(output) + output = self.conv3_x(output) + output = self.conv4_x(output) + output = self.conv5_x(output) + output = self.avg_pool(output) + output = output.view(output.size(0), -1) + output = self.fc(output) + + return output + + +# def rsnet18(): +# """ return a RSNet 18 object +# """ +# return RSNet(BasicBlock, [2, 2, 2, 2]) + + +# def rsnet34(): +# """ return a RSNet 34 object +# """ +# return RSNet(BasicBlock, [3, 4, 6, 3]) + +if __name__=='__main__': + import torchinfo + model = SpectralModel() + model.eval() + # print(model) + input = torch.randn(64,224) + y = model(input) + print(y.size()) + + torchinfo.summary(model,input_size=(64,224)) diff --git a/src/model/__pycache__/CNN_FCNN_big.cpython-312.pyc b/src/model/__pycache__/CNN_FCNN_big.cpython-312.pyc new file mode 100644 index 0000000..29f9112 Binary files /dev/null and b/src/model/__pycache__/CNN_FCNN_big.cpython-312.pyc differ diff --git a/src/model/__pycache__/CNN_FCNN_small.cpython-312.pyc b/src/model/__pycache__/CNN_FCNN_small.cpython-312.pyc new file mode 100644 index 0000000..5d2c64f Binary files /dev/null and b/src/model/__pycache__/CNN_FCNN_small.cpython-312.pyc differ diff --git a/src/model/__pycache__/CNN_LSTM_FCNN.cpython-312.pyc b/src/model/__pycache__/CNN_LSTM_FCNN.cpython-312.pyc index bc5094b..a726814 100644 Binary files a/src/model/__pycache__/CNN_LSTM_FCNN.cpython-312.pyc and b/src/model/__pycache__/CNN_LSTM_FCNN.cpython-312.pyc differ diff --git a/src/model/__pycache__/DRSN-CW.cpython-312.pyc b/src/model/__pycache__/DRSN-CW.cpython-312.pyc new file mode 100644 index 0000000..5b0bb35 Binary files /dev/null and b/src/model/__pycache__/DRSN-CW.cpython-312.pyc differ diff --git a/src/model/__pycache__/FCNN_DBSCAN_big.cpython-312.pyc b/src/model/__pycache__/FCNN_DBSCAN_big.cpython-312.pyc new file mode 100644 index 0000000..3e9865b Binary files /dev/null and b/src/model/__pycache__/FCNN_DBSCAN_big.cpython-312.pyc differ diff --git a/src/model/__pycache__/FCNN_DBSCAN_small.cpython-312.pyc b/src/model/__pycache__/FCNN_DBSCAN_small.cpython-312.pyc new file mode 100644 index 0000000..61862ce Binary files /dev/null and b/src/model/__pycache__/FCNN_DBSCAN_small.cpython-312.pyc differ diff --git a/src/model/__pycache__/FNN_CARS.cpython-312.pyc b/src/model/__pycache__/FNN_CARS.cpython-312.pyc new file mode 100644 index 0000000..6226486 Binary files /dev/null and b/src/model/__pycache__/FNN_CARS.cpython-312.pyc differ diff --git 
a/src/model/__pycache__/FNN_CARS_big.cpython-312.pyc b/src/model/__pycache__/FNN_CARS_big.cpython-312.pyc new file mode 100644 index 0000000..6dc2661 Binary files /dev/null and b/src/model/__pycache__/FNN_CARS_big.cpython-312.pyc differ diff --git a/src/model/__pycache__/FNN_CARS_small.cpython-312.pyc b/src/model/__pycache__/FNN_CARS_small.cpython-312.pyc new file mode 100644 index 0000000..f919eee Binary files /dev/null and b/src/model/__pycache__/FNN_CARS_small.cpython-312.pyc differ diff --git a/src/model/CNN1D_FCNN.py b/src/model/history/CNN1D_FCNN.py similarity index 100% rename from src/model/CNN1D_FCNN.py rename to src/model/history/CNN1D_FCNN.py diff --git a/src/model/CNN_FCNN.py b/src/model/history/CNN_FCNN_big.py similarity index 89% rename from src/model/CNN_FCNN.py rename to src/model/history/CNN_FCNN_big.py index 2ccf3f7..a19b65c 100644 --- a/src/model/CNN_FCNN.py +++ b/src/model/history/CNN_FCNN_big.py @@ -3,9 +3,9 @@ import torch.nn.functional as F import torchinfo import torch -class CNN_FCNN(nn.Module): +class SpectralModel(nn.Module): def __init__(self,): - super(CNN_FCNN, self).__init__() + super(SpectralModel, self).__init__() self.Conv2d_1=nn.Conv2d(in_channels=1, out_channels=512, kernel_size=3,stride=1) self.Conv2d_2=nn.Conv2d(in_channels=512, out_channels=1024, kernel_size=3,stride=1) @@ -24,7 +24,7 @@ class CNN_FCNN(nn.Module): self.fc_2=nn.Linear(in_features=2048,out_features=1024) self.fc_3=nn.Linear(in_features=1024,out_features=512) self.fc_4=nn.Linear(in_features=512,out_features=256) - self.fc_5=nn.Linear(in_features=256,out_features=4) + self.fc_5=nn.Linear(in_features=256,out_features=8) # self.reshape_3=nn.Unflatten(dim=0,unflattened_size=(batch_size,601)) @@ -71,6 +71,6 @@ class CNN_FCNN(nn.Module): return x if __name__=="__main__": - model=CNN_FCNN() + model=SpectralModel() - torchinfo.summary(model,input_size=(64, 48, 224, 10)) \ No newline at end of file + torchinfo.summary(model,input_size=(64, 20, 224, 10)) \ No newline at end of file diff --git a/src/model/history/CNN_FCNN_small.py b/src/model/history/CNN_FCNN_small.py new file mode 100644 index 0000000..b98e0dd --- /dev/null +++ b/src/model/history/CNN_FCNN_small.py @@ -0,0 +1,78 @@ +import torch.nn as nn +import torch.nn.functional as F +import torchinfo +import torch + +class SpectralModel(nn.Module): + def __init__(self,): + super(SpectralModel, self).__init__() + + self.Conv2d_1=nn.Conv2d(in_channels=1, out_channels=4, kernel_size=3,stride=1) + self.Conv2d_2=nn.Conv2d(in_channels=4, out_channels=8, kernel_size=3,stride=1) + # self.MaxPool2d_1=nn.MaxPool2d(kernel_size=2, stride=2) + + self.Conv2d_3=nn.Conv2d(in_channels=8, out_channels=16, kernel_size=3,stride=1) + + self.Conv2d_4=nn.Conv2d(in_channels=16, out_channels=8, kernel_size=3,stride=1) + self.Conv2d_5=nn.Conv2d(in_channels=8, out_channels=4, kernel_size=2,stride=1) + self.flatten=nn.Flatten(start_dim=1, end_dim=3) + + + + self.fc_1=nn.Linear(in_features=860,out_features=512) #2048->1024 + + self.fc_2=nn.Linear(in_features=512,out_features=256) + self.fc_3=nn.Linear(in_features=256,out_features=128) + self.fc_4=nn.Linear(in_features=128,out_features=64) + self.fc_5=nn.Linear(in_features=64,out_features=8) + + + # self.reshape_3=nn.Unflatten(dim=0,unflattened_size=(batch_size,601)) + + + + + + def forward(self, x): + # input_shape=x.shape + + x=torch.mean(x,dim=1,keepdim=True) + + # x=torch.unsqueeze(x, 1) + x=F.tanh(self.Conv2d_1(x)) + x=F.tanh(self.Conv2d_2(x)) + # x=self.MaxPool2d_1(x) + x=F.tanh(self.Conv2d_3(x)) + 
x=F.tanh(self.Conv2d_4(x)) + x=F.tanh(self.Conv2d_5(x)) + + x=self.flatten(x) + print(x.shape) + x=F.tanh(self.fc_1(x)) + x=F.tanh(self.fc_2(x)) + x=F.tanh(self.fc_3(x)) + x=F.tanh(self.fc_4(x)) + x=F.sigmoid(self.fc_5(x)) + + # + # x=nn.Unflatten(dim=0,unflattened_size=(int(input_shape[0]),int(input_shape[1])))(x) + # x=nn.Flatten(start_dim=2, end_dim=4)(x) + + + # x,_=self.lstm_1(x) + + # x=x[:,-1,:] + + + # x=nn.LeakyReLU()(self.fc_1(x)) + # x=nn.LeakyReLU()(self.fc_2(x)) + # x=nn.Sigmoid()(self.fc_3(x)) + + + + return x + +if __name__=="__main__": + model=SpectralModel() + + torchinfo.summary(model,input_size=(64, 20, 224, 10)) \ No newline at end of file diff --git a/src/model/CNN_LSTM_FCNN.py b/src/model/history/CNN_LSTM_FCNN.py similarity index 87% rename from src/model/CNN_LSTM_FCNN.py rename to src/model/history/CNN_LSTM_FCNN.py index 078d88c..bcfce58 100644 --- a/src/model/CNN_LSTM_FCNN.py +++ b/src/model/history/CNN_LSTM_FCNN.py @@ -3,9 +3,9 @@ import torch.nn.functional as F import torchinfo import torch -class CNN_LSTM_FCNN(nn.Module): +class SpectralModel(nn.Module): def __init__(self,): - super(CNN_LSTM_FCNN, self).__init__() + super(SpectralModel, self).__init__() # self.reshape_1=nn.Flatten(start_dim=0, end_dim=1) self.Conv2d_1=nn.Conv2d(in_channels=1, out_channels=256, kernel_size=3,stride=1) @@ -27,7 +27,7 @@ class CNN_LSTM_FCNN(nn.Module): self.fc_1=nn.Linear(in_features=2048,out_features=1024) #2048->1024 self.fc_2=nn.Linear(in_features=1024,out_features=256) - self.fc_3=nn.Linear(in_features=256,out_features=4) + self.fc_3=nn.Linear(in_features=256,out_features=8) # self.reshape_3=nn.Unflatten(dim=0,unflattened_size=(batch_size,601)) @@ -64,6 +64,6 @@ class CNN_LSTM_FCNN(nn.Module): return x if __name__=="__main__": - model=CNN_LSTM_FCNN() + model=SpectralModel() - torchinfo.summary(model,input_size=(64, 48, 224, 10)) \ No newline at end of file + torchinfo.summary(model,input_size=(64, 20, 224, 10)) \ No newline at end of file diff --git a/src/model/FCNN.py b/src/model/history/FCNN.py similarity index 100% rename from src/model/FCNN.py rename to src/model/history/FCNN.py diff --git a/src/model/history/FCNN_DBSCAN_big.py b/src/model/history/FCNN_DBSCAN_big.py new file mode 100644 index 0000000..0a05df2 --- /dev/null +++ b/src/model/history/FCNN_DBSCAN_big.py @@ -0,0 +1,55 @@ +import torch.nn as nn +import torch.nn.functional as F +import torchinfo +import torch + +class SpectralModel(nn.Module): + def __init__(self,): + super(SpectralModel, self).__init__() + self.norm=nn.BatchNorm1d(224) + self.fc1 = nn.Linear(224, 1024) + self.fc2 = nn.Linear(1024, 2048) + self.fc3 = nn.Linear(2048, 4096) + self.fc4 = nn.Linear(4096, 2048) + self.fc5 = nn.Linear(2048, 1024) + self.fc6 = nn.Linear(1024, 512) + + self.fc7 = nn.Linear(512, 128) + self.fc8 = nn.Linear(128, 8) + + + + def forward(self, x): + x=self.norm(x) + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = F.relu(self.fc3(x)) + x = F.relu(self.fc4(x)) + x = F.relu(self.fc5(x)) + x = F.relu(self.fc6(x)) + x = F.relu(self.fc7(x)) + x = F.relu(self.fc8(x)) + x = F.sigmoid(x) + return x +# class SpectralModel(nn.Module): +# def __init__(self,): +# super(SpectralModel, self).__init__() + +# self.norm=nn.BatchNorm1d(224) +# self.submodels = nn.ModuleList([SmallFCNNModel() for _ in range(8)]) + + + + +# def forward(self, x): +# x=self.norm(x) + +# res = [submodel(x) for submodel in self.submodels] +# x=torch.cat(res,dim=1) + +# return x + +if __name__=="__main__": + model=SpectralModel().to("cuda:0") + + 
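+    # Rough shape check (assumes a CUDA device, as in the line above): a batch of 64
+    # single 224-band spectra maps to a (64, 8) output.
+    # y = model(torch.rand(64, 224).to("cuda:0"))   # -> torch.Size([64, 8])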
torchinfo.summary(model,input_size=(64, 224)) \ No newline at end of file diff --git a/src/model/history/FCNN_DBSCAN_small.py b/src/model/history/FCNN_DBSCAN_small.py new file mode 100644 index 0000000..5b291ae --- /dev/null +++ b/src/model/history/FCNN_DBSCAN_small.py @@ -0,0 +1,68 @@ +import torch.nn as nn +import torch.nn.functional as F +import torchinfo +import torch + +class SmallFCNNModel(nn.Module): + def __init__(self,): + super(SmallFCNNModel, self).__init__() + + self.fc1 = nn.Linear(224, 50) + + self.fc9 = nn.Linear(50, 1) + + + + def forward(self, x): + # x=self.norm(x) + x = F.relu(self.fc1(x)) + x = F.sigmoid(self.fc9(x)) + return x +class SpectralModel(nn.Module): + def __init__(self,): + super(SpectralModel, self).__init__() + + self.norm=nn.BatchNorm1d(224) + # self.fc1 = nn.Linear(100, 50) + # # self.fc2 = nn.Linear(1024, 2048) + # # self.fc3 = nn.Linear(2048, 4096) + # # self.fc4 = nn.Linear(4096, 2048) + # # self.fc5 = nn.Linear(2048, 1024) + # # self.fc6 = nn.Linear(1024, 512) + # # self.fc7 = nn.Linear(512, 256) + # # self.fc8 = nn.Linear(256, 128) + # self.fc9 = nn.Linear(50, 8) + # # self.fc4 = nn.Linear(10240, 4) + self.submodels = nn.ModuleList([SmallFCNNModel() for _ in range(8)]) + + + + + def forward(self, x): + x=self.norm(x) + + res = [submodel(x) for submodel in self.submodels] + # res=[self.submodels[0](x)] + + # for i in range(1,len(self.submodels)): + # res.append(self.submodels[i](0*x)) + + + + + # x = F.relu(self.fc2(x)) + # x = F.relu(self.fc3(x)) + # x = F.relu(self.fc4(x)) + # x = F.relu(self.fc5(x)) + # x = F.relu(self.fc6(x)) + # x = F.relu(self.fc7(x)) + # x = F.relu(self.fc8(x)) + # x = F.sigmoid(self.fc9(x)) + x=torch.cat(res,dim=1) + + return x + +if __name__=="__main__": + model=SpectralModel().to("cuda:0") + + torchinfo.summary(model,input_size=(64, 224)) \ No newline at end of file diff --git a/src/optimizer/__pycache__/trainable.cpython-312.pyc b/src/optimizer/__pycache__/trainable.cpython-312.pyc index d2d8ea4..33eb23d 100644 Binary files a/src/optimizer/__pycache__/trainable.cpython-312.pyc and b/src/optimizer/__pycache__/trainable.cpython-312.pyc differ diff --git a/src/optimizer/__pycache__/utils.cpython-312.pyc b/src/optimizer/__pycache__/utils.cpython-312.pyc new file mode 100644 index 0000000..61f9072 Binary files /dev/null and b/src/optimizer/__pycache__/utils.cpython-312.pyc differ diff --git a/src/optimizer/trainable.py b/src/optimizer/trainable.py index 753a452..f086196 100644 --- a/src/optimizer/trainable.py +++ b/src/optimizer/trainable.py @@ -22,6 +22,8 @@ import numpy as np import matplotlib.pyplot as plt +from optimizer.utils import save_inference_files + logger = logging.getLogger("ray") def trainable(hydra_cfg,tune_cfg): @@ -39,7 +41,7 @@ def trainable(hydra_cfg,tune_cfg): ## 原始数据预处理为数据集 ## ############################ t_1=time.time() - data_preprocess=DataPreProcess( hydra_cfg.raw_spectral_data_dir,hydra_cfg.raw_labels_dir,hydra_cfg.labels_name,hydra_cfg.dataset_dir,hydra.utils.instantiate(tune_cfg["choose_frame_spatial"]), hydra.utils.instantiate(tune_cfg["features_scaling"]),hydra.utils.instantiate(tune_cfg["labels_scaling"]),hydra_cfg.dataset.train_ratio,hydra_cfg.dataset.validate_ratio) + data_preprocess=DataPreProcess( hydra_cfg.dataset_dir,hydra.utils.instantiate(tune_cfg["choose_frame_spatial"]), hydra.utils.instantiate(tune_cfg["features_scaling"]),hydra.utils.instantiate(tune_cfg["labels_scaling"])) t_2=time.time() logger.info(f"Preprocessed raw data costs {t_2-t_1}s") @@ -47,12 +49,18 @@ def 
trainable(hydra_cfg,tune_cfg): ## 加载数据集为dataloader ## ############################ - train_dataset,val_dataset=load_dataset(data_preprocess.dataset_save_dir) + train_dataset=load_dataset(data_preprocess.dataset_save_dir,dataset_type="training") + val_dataset=load_dataset(data_preprocess.dataset_save_dir,dataset_type="validation") + + + + + trainloader = torch.utils.data.DataLoader( - train_dataset, batch_size=int(tune_cfg["batch_size"]), shuffle=True, num_workers=hydra_cfg.dataset.num_worker + train_dataset, batch_size=int(tune_cfg["batch_size"]), shuffle=True, num_workers=hydra_cfg.dataset.num_worker,drop_last=False ) valloader = torch.utils.data.DataLoader( - val_dataset, batch_size=int(tune_cfg["batch_size"]), shuffle=True, num_workers=hydra_cfg.dataset.num_worker + val_dataset, batch_size=int(tune_cfg["batch_size"]), shuffle=True, num_workers=hydra_cfg.dataset.num_worker,drop_last=False ) t_3=time.time() logger.info(f"Dataloader costs {t_3-t_2}s") @@ -101,7 +109,7 @@ def trainable(hydra_cfg,tune_cfg): ## 生成模型架构图写入tensorboad ## ################################ sample = train_dataset[0:4][0] - print(sample.shape) + # print(sample.shape) writer.add_graph(model, torch.tensor(sample).to(device)) @@ -113,6 +121,7 @@ def trainable(hydra_cfg,tune_cfg): ################################ ## 模型训练 ## ################################ + logger.info(f"Start epoch {epoch+1}") train_loss = 0.0 train_steps = 0 @@ -121,11 +130,28 @@ def trainable(hydra_cfg,tune_cfg): train_errors=None train_outputs=None train_labels=None + train_bounds=None model.train() + + # t_1=time.time() + # for batch, (features, labels) in enumerate(trainloader): + # features=features.to(device) + # labels=labels.to(device) + + # t_2=time.time() + # print(f"DEBUDING cost{t_2-t_1}s") + for batch, (features, labels) in enumerate(trainloader): + + + t_iteration_start=time.time() features=features.to(device) labels=labels.to(device) + + + + # zero the parameter gradients optimizer.zero_grad() @@ -136,9 +162,13 @@ def trainable(hydra_cfg,tune_cfg): labels_npy=labels.cpu().detach().numpy() outputs_npy=outputs.cpu().detach().numpy() + # print("outputs_npy",outputs_npy) + # print("labels_npy",labels_npy) # 生成自定义的指标 - error,hit_rate=data_preprocess.get_metric(outputs_npy, labels_npy) + outputs_npy, labels_npy,error,hit_rate,bounds=data_preprocess.get_metric(outputs_npy, labels_npy) + # print("outputs_npy_after",outputs_npy) + # print("labels_npy_after",labels_npy) # Backpropagation loss.backward() @@ -147,16 +177,21 @@ def trainable(hydra_cfg,tune_cfg): # 记录我们自定义的指标 train_loss += loss.item() train_steps += 1 + if train_steps==1: train_hit_rate=hit_rate train_errors=error train_outputs=outputs_npy train_labels=labels_npy + train_bounds=bounds else: + + train_hit_rate=(train_steps-1)/train_steps *train_hit_rate+ 1/train_steps*hit_rate train_errors=np.concatenate((train_errors,error),axis=0) train_outputs=np.concatenate((train_outputs,outputs_npy),axis=0) train_labels=np.concatenate((train_labels,labels_npy),axis=0) + train_bounds=np.concatenate((train_bounds,bounds),axis=0) t_iteration_end=time.time() print("Training Epoch:[{}/{}],Iteration:{}/{},Loss:{}, Cost {}s".format(epoch+1,hydra_cfg.train.max_epoch,train_steps,len(trainloader),loss.item(),t_iteration_end-t_iteration_start)) @@ -168,6 +203,9 @@ def trainable(hydra_cfg,tune_cfg): val_loss = 0.0 val_steps = 0 + val_errors=None + val_outputs=None + val_labels=None val_hit_rate=None t_epoch_train_end=time.time() @@ -177,20 +215,30 @@ def trainable(hydra_cfg,tune_cfg): for batch, (features, 
labels) in enumerate(valloader): t_iteration_start=time.time() with torch.no_grad(): + features=features.to(device) labels=labels.to(device) outputs = model(features) loss = criterion(outputs, labels) - error,hit_rate=data_preprocess.get_metric(outputs.cpu().detach().numpy(), labels.cpu().detach().numpy()) + labels_npy=labels.cpu().detach().numpy() + outputs_npy=outputs.cpu().detach().numpy() + + outputs_npy, labels_npy,error,hit_rate,bounds=data_preprocess.get_metric(outputs_npy, labels_npy) val_loss += loss.cpu().numpy() val_steps += 1 if val_steps==1: val_hit_rate=hit_rate + val_errors=error + val_outputs=outputs_npy + val_labels=labels_npy else: val_hit_rate=(val_steps-1)/val_steps *val_hit_rate+ 1/val_steps*hit_rate + val_errors=np.concatenate((val_errors,error),axis=0) + val_outputs=np.concatenate((val_outputs,outputs_npy),axis=0) + val_labels=np.concatenate((val_labels,labels_npy),axis=0) t_iteration_end=time.time() print("Validate Iteration:{}/{},Loss:{}, Cost {}s".format(val_steps,len(valloader),loss.item(),t_iteration_end-t_iteration_start)) @@ -206,41 +254,157 @@ def trainable(hydra_cfg,tune_cfg): - checkpoint_data = { + + + reports={"val_loss": val_loss / val_steps, + "train_loss": train_loss, + } + + for i,name in enumerate(data_preprocess.labelNames): + # reports[= + writer.add_scalar(f"{name}/training/hit_rate",train_hit_rate[i],global_step=epoch) + writer.add_scalar(f"{name}/validation/hit_rate",val_hit_rate[i],global_step=epoch) + + if (epoch!=0 and epoch%hydra_cfg.train.checkpoint_interval==0): + + + for i,name in enumerate(data_preprocess.labelNames): + # reports[= + + writer.add_histogram(tag=f'{name}/training/error_histogram', values=train_errors[:,i], global_step=epoch) + writer.add_histogram(tag=f'{name}/validation/error_histogram', values=val_errors[:,i], global_step=epoch) + + fig, ax = plt.subplots() + ax.scatter(train_labels[:,i],train_outputs[:,i]) + ax.plot([train_labels[:,i].min(), train_labels[:,i].min()], [train_labels[:,i].max(), train_labels[:,i].max()], 'r--') + ax.plot(train_labels[:,i],train_labels[:,i]-train_bounds[:,i], 'r--') + ax.plot(train_labels[:,i],train_labels[:,i]+train_bounds[:,i], 'r--') + + + ax.set_xlabel('Actual Values') + ax.set_ylabel('Estimated Values') + writer.add_figure(f'{name}/training/actual_vs_estimated_value', fig , epoch) + plt.close(fig) + + + fig, ax = plt.subplots() + ax.scatter(val_labels[:,i],val_outputs[:,i]) + ax.plot([val_labels[:,i].min(), val_labels[:,i].max()], [val_outputs[:,i].min(), val_outputs[:,i].max()], 'r--') + ax.set_xlabel('Actual Values') + ax.set_ylabel('Estimated Values') + writer.add_figure(f'{name}/validation/actual_vs_estimated_value', fig , epoch) + plt.close(fig) + + + + fig, ax = plt.subplots() + ax.scatter(train_labels[:,i],train_errors[:,i]) + ax.plot(train_labels[:,i],[0 for i in range(train_labels.shape[0])], color='r', linestyle='--') + ax.plot(train_labels[:,i],-train_bounds[:,i], 'r--') + ax.plot(train_labels[:,i],+train_bounds[:,i], 'r--') + ax.set_xlabel('Actual Values') + ax.set_ylabel('Residual error') + + writer.add_figure(f'{name}/training/training_actual_vs_residual', fig , epoch) + plt.close(fig) + + + + fig, ax = plt.subplots() + ax.scatter(val_labels[:,i],val_errors[:,i]) + ax.plot(val_labels[:,i],[0 for i in range(val_labels.shape[0])], color='r', linestyle='--') + ax.set_xlabel('Actual Values') + ax.set_ylabel('Residual error') + writer.add_figure(f'{name}/validation/training_actual_vs_Residual', fig , epoch) + plt.close(fig) + + with tempfile.TemporaryDirectory() as 
checkpoint_dir: + checkpoint_data = { "epoch": epoch, "net_state_dict": model.state_dict(), "optimizer_state_dict": optimizer.state_dict(), - } - + "data_preprocess":data_preprocess - with tempfile.TemporaryDirectory() as checkpoint_dir: + } data_path = pathlib.Path(checkpoint_dir) / "data.pkl" with open(data_path, "wb") as fp: pickle.dump(checkpoint_data, fp) + + save_inference_files(model,data_preprocess,checkpoint_dir) - checkpoint = ray.train.Checkpoint.from_directory(checkpoint_dir) - - reports={"val_loss": val_loss / val_steps, - "train_loss": train_loss, - } - for i,name in enumerate(hydra_cfg.labels_name): - reports[f"train_hit_rate_{name}"]=train_hit_rate[i] - reports[f"val_hit_rate_{name}"]=val_hit_rate[i] - writer.add_histogram(tag=f'train_error_{name}', values=train_errors[:,i], global_step=epoch) - - fig, ax = plt.subplots() - ax.scatter(np.arange(train_outputs.shape[0]),train_outputs[:,i]) - ax.scatter(np.arange(train_outputs.shape[0]),train_labels[:,i]) - writer.add_figure(f'train_imgage_{name}', fig , epoch) - plt.close(fig) + checkpoint = ray.train.Checkpoint.from_directory(checkpoint_dir) ray.train.report( reports, checkpoint=checkpoint, ) + else: + ray.train.report(reports,) t_epoch_end=time.time() logger.info(f"Save Checkpoint costs {t_epoch_end-t_epoch_val_end}s") print("Training Epoch:[{}/{}], Average Loss:{}, Cost {}s".format(epoch+1,hydra_cfg.train.max_epoch,train_loss,t_epoch_end-t_epoch_start)) - for i,name in enumerate(hydra_cfg.labels_name): - print(f"train_hit_rate_{name}: {train_hit_rate[i]} ",end=" ") - print(f"val_hit_rate_{name}: {val_hit_rate[i]} ",end=" ") \ No newline at end of file + # for i,name in enumerate(hydra_cfg.labels_name): + # print(f"train_hit_rate_{name}: {train_hit_rate[i]} ",end=" ") + # print(f"val_hit_rate_{name}: {val_hit_rate[i]} ",end=" ") + + ##################################################### + ## 每个epoch保存checkpoint,并记录到tensorboard ## + ##################################################### + + #测试 + + test_dataset=load_dataset(data_preprocess.dataset_save_dir,dataset_type="test") + + testloader = torch.utils.data.DataLoader( + test_dataset, batch_size=int(tune_cfg["batch_size"]), shuffle=True, num_workers=hydra_cfg.dataset.num_worker,drop_last=False + ) + + + + + test_loss = 0.0 + test_steps = 0 + test_errors=None + test_outputs=None + test_labels=None + test_hit_rate=None + logger.info(f"Test starts") + t_epoch_test_start=time.time() + + model.eval() + + for batch, (features, labels) in enumerate(testloader): + t_iteration_start=time.time() + with torch.no_grad(): + features=features.to(device) + labels=labels.to(device) + + outputs = model(features) + loss = criterion(outputs, labels) + + labels_npy=labels.cpu().detach().numpy() + outputs_npy=outputs.cpu().detach().numpy() + + outputs_npy, labels_npy,error,hit_rate,bounds=data_preprocess.get_metric(outputs_npy, labels_npy) + + test_loss += loss.cpu().numpy() + test_steps += 1 + if test_steps==1: + test_hit_rate=hit_rate + test_errors=error + test_outputs=outputs_npy + test_labels=labels_npy + else: + test_hit_rate=(test_steps-1)/test_steps *test_hit_rate+ 1/test_steps*hit_rate + test_errors=np.concatenate((test_errors,error),axis=0) + test_outputs=np.concatenate((test_outputs,outputs_npy),axis=0) + test_labels=np.concatenate((test_labels,labels_npy),axis=0) + t_iteration_end=time.time() + print("Test Iteration:{}/{},Loss:{}, Cost {}s".format(test_steps,len(testloader),loss.item(),t_iteration_end-t_iteration_start)) + + t_epoch_test_end=time.time() + logger.info(f"Test costs 
{t_epoch_test_end-t_epoch_test_start}s") + tmp_string="" + for i,name in enumerate(data_preprocess.labelNames): + tmp_string+=f"{name}\t:{test_hit_rate[i]*100:.2f}%\n" + writer.add_text(f"test/hit_rate",tmp_string) diff --git a/src/optimizer/utils.py b/src/optimizer/utils.py new file mode 100644 index 0000000..1781560 --- /dev/null +++ b/src/optimizer/utils.py @@ -0,0 +1,48 @@ +import shutil +import pathlib +import pickle +import torch +def save_inference_files(model,data_preprocess,checkpoint_dir): + sava_dir=pathlib.Path(checkpoint_dir)/"inference" + print("开始在checkpoint中保存推理所需文件") + print("choose_frame_spatial文件路径:",data_preprocess.choose_frame_spatial.file_path.parent) + + #保存choose_frame_spatial相关文件 + + + sava_dir.mkdir(mode=0o777, parents=True, exist_ok=True) + + shutil.copy(data_preprocess.choose_frame_spatial.file_path,sava_dir/"choose_frame_spatial.py") + with open(sava_dir/"choose_frame_spatial.pkl",'wb') as f: + pickle.dump(data_preprocess.choose_frame_spatial.state_dict(),f) + + + shutil.copy(data_preprocess.features_scaling.file_path,sava_dir/"features_scaling.py") + with open(sava_dir/"features_scaling.pkl",'wb') as f: + pickle.dump(data_preprocess.features_scaling.state_dict(),f) + + + shutil.copy(data_preprocess.labels_scaling.file_path,sava_dir/"labels_scaling.py") + with open(sava_dir/"labels_scaling.pkl",'wb') as f: + pickle.dump(data_preprocess.labels_scaling.state_dict(),f) + + with open(sava_dir/"labelNames.pkl",'wb') as f: + pickle.dump(data_preprocess.labelNames,f) + + + input_tensor = torch.rand((1,*data_preprocess.features_scaling.feature_shape), dtype=torch.float32).to("cuda:0") + torch.onnx.export( + model, # model to export + (input_tensor,), # inputs of the model, + str(sava_dir/"model.onnx"), # filename of the ONNX model + input_names=["input"], # Rename inputs for the ONNX model + dynamo=True # True or False to select the exporter to use + ) + + + + + + + + diff --git a/src/scripts/Add_Lab.py b/src/scripts/Add_Lab.py new file mode 100644 index 0000000..051d19e --- /dev/null +++ b/src/scripts/Add_Lab.py @@ -0,0 +1,149 @@ +import pandas as pd +# from tqdm import tqdm +if __name__ =="__main__": + + label_file_path="/data/SEMS-model-training/labels/NanEr/2024-05-15_08-03_2335.xlsx" + raw_label_lab_file_path="/data/SEMS-model-training/labels/raw_labels/NanEr/5-8月份转炉TSC和TSO化验结果.xls" + + save_dir="/data/SEMS-model-training/labels/NanEr/2024-05-15_08-03_2335_.xlsx" + + labels=pd.read_excel(label_file_path) + + print(f"原始数据量{labels.shape[0]}") + labels=labels.loc[:,["Furnace_Number","Steel_Type","TSC_start_time","TSC_end_time","TSC_T","TSC_C","TSO_start_time","TSO_end_time","TSO_T","TSO_C"]] + + #下述功能集成到数据预处理中。因为每一行但凡有0就删,太严格,应该要求指定的label有0就删 + # # 选出有NULL的行 + null_rows=labels.isnull().any(axis=1) + # # 选出有0的行 + zeros_rows=(labels==0).any(axis=1) | (labels=='0').any(axis=1) + + # # 每一行但凡有NULL或者0都给删了 + selected_rows=~(null_rows|zeros_rows) + labels=labels[selected_rows] + + print(f"删除无效数据后{labels.shape[0]}") + + labels_output=labels.copy() + + labels_output['TSC_P']=0.0 + labels_output['TSC_S']=0.0 + labels_output['TSC_Mn']=0.0 + labels_output['TSC_Ni']=0.0 + labels_output['TSC_Mo']=0.0 + labels_output['TSC_Cr']=0.0 + labels_output['TSO_P']=0.0 + labels_output['TSO_S']=0.0 + labels_output['TSO_Mn']=0.0 + labels_output['TSO_Ni']=0.0 + labels_output['TSO_Mo']=0.0 + labels_output['TSO_Cr']=0.0 + + + + + + + raw_label_lab=pd.read_excel(raw_label_lab_file_path) + raw_label_lab=raw_label_lab.loc[:,["炉次号","分析时间","Mn","P","S","Ni","Cr","Mo"]] + # print(raw_label_lab) + + for 
i in range(labels.shape[0]): + # i=4 + furnaceID=labels.iloc[i]["Furnace_Number"] + + # print(raw_label_lab[raw_label_lab["炉次号"]==furnaceID]) + + tmp=raw_label_lab[raw_label_lab["炉次号"]==furnaceID] + + if tmp.shape[0]==0: + print(f"炉次号{furnaceID}找不到对应的数据") + continue + elif tmp.shape[0]==1: + print(f"炉次号{furnaceID}找到1个") + labels_output.loc[i,'TSC_P']=tmp.iloc[0]["P"] + labels_output.loc[i,'TSC_S']=tmp.iloc[0]["S"] + labels_output.loc[i,'TSC_Mn']=tmp.iloc[0]["Mn"] + labels_output.loc[i,'TSC_Ni']=tmp.iloc[0]["Ni"] + labels_output.loc[i,'TSC_Mo']=tmp.iloc[0]["Mo"] + labels_output.loc[i,'TSC_Cr']=tmp.iloc[0]["Cr"] + labels_output.loc[i,'TSO_P']=tmp.iloc[0]["P"] + labels_output.loc[i,'TSO_S']=tmp.iloc[0]["S"] + labels_output.loc[i,'TSO_Mn']=tmp.iloc[0]["Mn"] + labels_output.loc[i,'TSO_Ni']=tmp.iloc[0]["Ni"] + labels_output.loc[i,'TSO_Mo']=tmp.iloc[0]["Mo"] + labels_output.loc[i,'TSO_Cr']=tmp.iloc[0]["Cr"] + else: + print(f"炉次号{furnaceID}找到{tmp.shape[0]}个") + + #寻找离TSC最近的 + min_time=None + min_index=0 + for j in range(tmp.shape[0]): + # print(j,i) + delta_time=tmp.iloc[j]["分析时间"]-labels.iloc[i]["TSC_end_time"] + if min_time is None: + min_time=delta_time + else: + if delta_timeNone: + + output_folder=output_folder + output_folder.mkdir(mode=0o777,parents=True,exist_ok=True) + + data=np.load(file_path,allow_pickle=True) + #此处data["rawSpectralData"]的维度为(时间维度,光谱维度,空间维度) + rawSpectralData=data["rawSpectralData"] + + + + plt.figure() + spectrum_band=np.linspace(400,1000,224) + # plt.ylim([0,4095]) + plt.plot(spectrum_band,rawSpectralData) + # plt.title("Max Point Spectrum") + plt.savefig(output_folder/f"{file_path.stem}.png",dpi=100) + plt.close() + #画最强点的光谱 + + + +def main(raw_data_folder:pathlib.Path,output_folder:pathlib.Path)->None: + + files=list(raw_data_folder.glob("*.npz")) + + # plot_npz(files[0],output_folder ) + + p=multiprocessing.Pool(4) + + for i in range(len(files)): + p.apply_async(plot_npz, args=(files[i],output_folder)) + + print('Waiting for all subprocesses done...') + p.close() + p.join() + print('All subprocesses done.') + +if __name__=="__main__": + raw_dataset_folder=pathlib.Path("/data/SEMS-model-training/dataset/DBSCAN_eps_0.15_min_samples_10/pre_dataset") + output_folder=pathlib.Path("/data/SEMS-model-training/dataset/DBSCAN_eps_0.15_min_samples_10/pre_dataset_visual") + main(raw_dataset_folder/"test",output_folder/"test",) + main(raw_dataset_folder/"training",output_folder/"training",) + main(raw_dataset_folder/"validation",output_folder/"validation",) \ No newline at end of file diff --git a/src/scripts/raw_dataset_visual.py b/src/scripts/raw_dataset_visual.py new file mode 100644 index 0000000..74865e0 --- /dev/null +++ b/src/scripts/raw_dataset_visual.py @@ -0,0 +1,265 @@ +import multiprocessing.pool +import pathlib +import os +import numpy as np +import matplotlib.pyplot as plt +from sklearn.decomposition import PCA +from sklearn.decomposition import KernelPCA + +from sklearn.manifold import TSNE +from sklearn.cluster import DBSCAN +from sklearn import metrics +from sklearn.cluster import OPTICS + +import multiprocessing + + +def plot_npz(file_path:pathlib.Path,output_folder:pathlib.Path)->None: + + output_folder=output_folder/file_path.stem + output_folder.mkdir(mode=0o777,parents=True,exist_ok=True) + + data=np.load(file_path,allow_pickle=True) + #此处data["rawSpectralData"]的维度为(时间维度,光谱维度,空间维度) + rawSpectralData=data["rawSpectralData"].transpose(0, 2, 1) + 
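+    # After the transpose the array is (time, spatial, spectral); the reshape below stacks the
+    # time and spatial axes so each spatial point's spectrum becomes one row, ready for the
+    # PCA / t-SNE / DBSCAN experiments further down in this function.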
rawSpectralData=rawSpectralData.reshape((rawSpectralData.shape[0]*rawSpectralData.shape[1],rawSpectralData.shape[2])) + #rawSpectralData的维度为(时间维度*空间维度,光谱维度) + + tmp=np.max(rawSpectralData,axis=1) + rawSpectralData=rawSpectralData[(tmp>500) & (tmp<4095),:] + #选出没有欠曝与过曝的空间点 + + + rawSpectralData_normed=(rawSpectralData-np.min(rawSpectralData,axis=1,keepdims=True))/(np.max(rawSpectralData,axis=1,keepdims=True)-np.min(rawSpectralData,axis=1,keepdims=True)) + #归一化所有光谱,去除强度信息 + + + plt.figure() + spectrum_band=np.linspace(400,1000,224) + index=np.argmax(rawSpectralData) + index=np.unravel_index(index, rawSpectralData.shape) + plt.ylim([0,4095]) + plt.plot(spectrum_band,rawSpectralData[index[0],:]) + plt.title("Max Point Spectrum") + plt.savefig(output_folder/"max_point_spectrum.png",dpi=100) + plt.close() + #画最强点的光谱 + + + + ################################ + # PCA + ################################ + # pca_norm_1D = PCA(n_components=1) + # norm_feat_1D = pca_norm_1D.fit(rawSpectralData_normed).transform(rawSpectralData_normed) + # pca_norm_2D = PCA(n_components=2) + # norm_feat_2D = pca_norm_2D.fit(rawSpectralData_normed).transform(rawSpectralData_normed) + # pca_norm_3D = PCA(n_components=3) + # norm_feat_3D = pca_norm_3D.fit(rawSpectralData_normed).transform(rawSpectralData_normed) + + # pca_1D = PCA(n_components=1) + # feat_1D = pca_1D.fit(rawSpectralData).transform(rawSpectralData) + # pca_2D = PCA(n_components=2) + # feat_2D = pca_2D.fit(rawSpectralData).transform(rawSpectralData) + # pca_3D = PCA(n_components=3) + # feat_3D = pca_3D.fit(rawSpectralData).transform(rawSpectralData) + + + # fig, axs = plt.subplots(2, 3,figsize=(15,10)) + + # axs[0,0].scatter(norm_feat_1D, np.zeros((norm_feat_1D.shape[0],)), s=0.1) + # axs[0, 0].set_title(' norm') + # axs[0,1].scatter(norm_feat_2D[:,0], norm_feat_2D[:,1], s=0.1) + # ax_3d = fig.add_subplot(2, 3, 3, projection='3d') + # ax_3d.scatter(norm_feat_3D[:, 0], norm_feat_3D[:, 1], norm_feat_3D[:, 2], s=0.01) + + + # axs[1,0].scatter(feat_1D, np.zeros((feat_1D.shape[0],)), s=0.1) + # axs[1, 0].set_title(' raw') + # axs[1,1].scatter(feat_2D[:,0], feat_2D[:,1], s=0.1) + # ax_3d = fig.add_subplot(2, 3, 6, projection='3d') + # ax_3d.scatter(feat_3D[:, 0], feat_3D[:, 1], feat_3D[:, 2], s=0.01) + # plt.savefig(output_folder/"PCA.png",dpi=100) + + # plt.close() + + + + ################################ + # KernelPCA gamma =15 + ################################ + + def plot_KernelPCA(gamma): + pca_norm_1D = KernelPCA(n_components=1, kernel='rbf', gamma=gamma) + norm_feat_1D = pca_norm_1D.fit(rawSpectralData_normed).transform(rawSpectralData_normed) + pca_norm_2D = KernelPCA(n_components=2, kernel='rbf', gamma=gamma) + norm_feat_2D = pca_norm_2D.fit(rawSpectralData_normed).transform(rawSpectralData_normed) + pca_norm_3D = KernelPCA(n_components=3, kernel='rbf', gamma=gamma) + norm_feat_3D = pca_norm_3D.fit(rawSpectralData_normed).transform(rawSpectralData_normed) + + pca_1D = KernelPCA(n_components=1, kernel='rbf', gamma=gamma) + feat_1D = pca_1D.fit(rawSpectralData).transform(rawSpectralData) + pca_2D = KernelPCA(n_components=2, kernel='rbf', gamma=gamma) + feat_2D = pca_2D.fit(rawSpectralData).transform(rawSpectralData) + pca_3D = KernelPCA(n_components=3, kernel='rbf', gamma=gamma) + feat_3D = pca_3D.fit(rawSpectralData).transform(rawSpectralData) + + + fig, axs = plt.subplots(2, 3,figsize=(15,10)) + + axs[0,0].scatter(norm_feat_1D, np.zeros((norm_feat_1D.shape[0],)), s=0.1) + axs[0, 0].set_title(' norm') + axs[0,1].scatter(norm_feat_2D[:,0], 
norm_feat_2D[:,1], s=0.1) + ax_3d = fig.add_subplot(2, 3, 3, projection='3d') + ax_3d.scatter(norm_feat_3D[:, 0], norm_feat_3D[:, 1], norm_feat_3D[:, 2], s=0.01) + + + axs[1,0].scatter(feat_1D, np.zeros((feat_1D.shape[0],)), s=0.1) + axs[1, 0].set_title(' raw') + axs[1,1].scatter(feat_2D[:,0], feat_2D[:,1], s=0.1) + ax_3d = fig.add_subplot(2, 3, 6, projection='3d') + ax_3d.scatter(feat_3D[:, 0], feat_3D[:, 1], feat_3D[:, 2], s=0.01) + plt.savefig(output_folder/f"KernelPCA_gamma_{gamma}.png",dpi=100) + + plt.close() + + # plot_KernelPCA(gamma=None) + # plot_KernelPCA(gamma=1) + # plot_KernelPCA(gamma=5) + # plot_KernelPCA(gamma=10) + # plot_KernelPCA(gamma=15) + + + + ################################ + # t-SNE + ################################ + # pca_norm_1D = TSNE(n_components=1, learning_rate='auto', init='pca') + # norm_feat_1D = pca_norm_1D.fit(rawSpectralData_normed).fit_transform(rawSpectralData_normed) + # pca_norm_2D = TSNE(n_components=2, learning_rate='auto', init='pca') + # norm_feat_2D = pca_norm_2D.fit(rawSpectralData_normed).fit_transform(rawSpectralData_normed) + # pca_norm_3D = TSNE(n_components=3, learning_rate='auto', init='pca') + # norm_feat_3D = pca_norm_3D.fit(rawSpectralData_normed).fit_transform(rawSpectralData_normed) + + # pca_1D = TSNE(n_components=1, learning_rate='auto', init='pca') + # feat_1D = pca_1D.fit(rawSpectralData).fit_transform(rawSpectralData) + # pca_2D = TSNE(n_components=2, learning_rate='auto', init='pca') + # feat_2D = pca_2D.fit(rawSpectralData).fit_transform(rawSpectralData) + # pca_3D = TSNE(n_components=3, learning_rate='auto', init='pca') + # feat_3D = pca_3D.fit(rawSpectralData).fit_transform(rawSpectralData) + + + # fig, axs = plt.subplots(2, 3,figsize=(15,10)) + + # axs[0,0].scatter(norm_feat_1D, np.zeros((norm_feat_1D.shape[0],)), s=0.1) + # axs[0, 0].set_title(' norm') + # axs[0,1].scatter(norm_feat_2D[:,0], norm_feat_2D[:,1], s=0.1) + # ax_3d = fig.add_subplot(2, 3, 3, projection='3d') + # ax_3d.scatter(norm_feat_3D[:, 0], norm_feat_3D[:, 1], norm_feat_3D[:, 2], s=0.01) + + + # axs[1,0].scatter(feat_1D, np.zeros((feat_1D.shape[0],)), s=0.1) + # axs[1, 0].set_title(' raw') + # axs[1,1].scatter(feat_2D[:,0], feat_2D[:,1], s=0.1) + # ax_3d = fig.add_subplot(2, 3, 6, projection='3d') + # ax_3d.scatter(feat_3D[:, 0], feat_3D[:, 1], feat_3D[:, 2], s=0.01) + # plt.savefig(output_folder/"t-SNE.png",dpi=100) + + # plt.close() + + + def plot_DBSCAN(eps=0.15, min_samples=10): + + db_norm = DBSCAN(eps=eps, min_samples=min_samples).fit(rawSpectralData_normed) + + labels_norm = db_norm.labels_ + n_norm = len(set(labels_norm)) - (1 if -1 in labels_norm else 0) + n_noise_norm = list(labels_norm).count(-1) + + + max_i=0 + max_num=0 + for i in range(n_norm): + tmp=(labels_norm==i).sum() + if tmp>max_num: + max_i=i + max_num=tmp + + fig, axs = plt.subplots(2, 2,figsize=(10,10)) + for i in range(labels_norm.shape[0]): + if labels_norm[i]==max_i: + axs[0,0].plot(rawSpectralData_normed[i]) + axs[0,0].set_title(f'norm data w norm cluster {max_num},{n_norm},{n_noise_norm}') + + for i in range(labels_norm.shape[0]): + if labels_norm[i]==max_i: + axs[0,1].plot(rawSpectralData[i]) + axs[0,1].set_title('norm data w norm cluster') + + + + + db_raw = DBSCAN(eps=eps, min_samples=min_samples).fit(rawSpectralData) + labels_raw = db_raw.labels_ + n_raw = len(set(labels_raw)) - (1 if -1 in labels_raw else 0) + n_noise_raw = list(labels_raw).count(-1) + + + max_i=0 + max_num=0 + for i in range(n_raw): + tmp=(labels_raw==i).sum() + if tmp>max_num: + max_i=i + 
+                max_num=tmp
+
+        for i in range(labels_raw.shape[0]):
+            if labels_raw[i]==max_i:
+                axs[1,0].plot(rawSpectralData_normed[i])
+        axs[1,0].set_title(f'norm data w raw cluster {max_num},{n_raw},{n_noise_raw}')
+
+        for i in range(labels_raw.shape[0]):
+            if labels_raw[i]==max_i:
+                axs[1,1].plot(rawSpectralData[i])
+        axs[1,1].set_title('raw data w raw cluster')
+
+        plt.savefig(output_folder/f"DBSCAN_eps_{eps}_min_samples_{min_samples}.png",dpi=100)
+
+        plt.close()
+
+    plot_DBSCAN(eps=0.15, min_samples=10)
+    # plot_DBSCAN(eps=100, min_samples=10)
+
+
+    # print(file_path.stem,rawSpectralData.shape)
+
+
+def main(raw_data_folder:pathlib.Path,output_folder:pathlib.Path)->None:
+
+    files=list(raw_data_folder.glob("*.npz"))
+
+    # plot_npz(files[0],output_folder )
+
+    p=multiprocessing.Pool(4)
+
+    for i in range(len(files)):
+        p.apply_async(plot_npz, args=(files[i],output_folder))
+
+    print('Waiting for all subprocesses done...')
+    p.close()
+    p.join()
+    print('All subprocesses done.')
+
+if __name__=="__main__":
+    raw_dataset_folder=pathlib.Path("/data/SEMS-model-training/dataset/raw_dataset")
+    output_folder=pathlib.Path("/data/SEMS-model-training/dataset/raw_dataset_visual")
+    main(raw_dataset_folder/"test",output_folder/"test",)
+    main(raw_dataset_folder/"training",output_folder/"training",)
+    main(raw_dataset_folder/"validation",output_folder/"validation",)
\ No newline at end of file
diff --git a/src/scripts/rawbindata2rawdataset.py b/src/scripts/rawbindata2rawdataset.py
new file mode 100644
index 0000000..e31718c
--- /dev/null
+++ b/src/scripts/rawbindata2rawdataset.py
@@ -0,0 +1,276 @@
+'''
+@File : rawbindata2rawdataset.py
+@Time : 2024/08/09 13:54:28
+@Author : Zhanpeng Yang
+@Version : 0.0.1
+@Contact : zhpyang@outlook.com
+@Desc : Data pre-processing: convert the raw binary spectral data into the raw dataset
+'''
+
+import pandas as pd
+import logging
+import os,glob
+import numpy as np
+import datetime
+import pickle
+import h5py
+
+class DataPreProcess:
+    """
+    Contains all data pre-processing steps, namely
+    1. Extract the spectra recorded around the target times from the raw binary data, driven by the Excel file provided by the steel plant
+    2. From those spectra, select suitable and stable time and spatial points as the features
+    3. Scale the features and the labels
+
+    """
+    def __init__(self, raw_spectral_data_dir, raw_labels_dir,labels_name,dataset_dir,train_ratio=0.8,validate_ratio=0.1) -> None:
+        """Initialize the class with every parameter needed for the pre-processing
+
+        Args:
+            raw_spectral_data_dir (_type_): directory of the raw spectral data
+            raw_labels_dir (_type_): path of the raw labels
+            labels_name (_type_): names of the labels to extract
+            dataset_dir (_type_): directory of the generated dataset
+            train_ratio (float, optional): proportion used for the training set. Defaults to 0.8.
+            validate_ratio (float, optional): proportion used for the validation set; the remainder becomes the test set. Defaults to 0.1.
+        """
+        self.raw_spectral_data_dir=raw_spectral_data_dir
+        self.raw_labels_dir=raw_labels_dir
+        self.dataset_dir=dataset_dir
+        self.labels_name=labels_name
+
+        self.train_ratio=train_ratio
+        self.validate_ratio=validate_ratio
+
+        # load the raw labels
+        self.raw_labels=pd.read_excel(raw_labels_dir)
+
+        # load the cache describing the raw spectral data files
+        self.raw_spectral_data_cache=self._load_raw_spectral_data_cache(raw_spectral_data_dir)
+
+        # load one csv file to determine the dimensions of the spectral data
+        self.spectral_dim,self.spatial_dim= self._get_spectral_spatial_dim(self.raw_spectral_data_cache)
+
+        # start converting the raw data into the dataset
+        self._raw_data_2_dataset()
+
+
+    # def _load_raw_labels(self,raw_labels_dir:str,labels_name:list)->pd.DataFrame:
+    #     """Read the folder of script-processed Excel files provided by the steel plant (all files are scanned),
+    #     keep only the specified label columns as the output,
+    #     and drop every row containing NaN or 0
+
+    #     Args:
+    #         raw_labels_dir (str): path of the script-processed Excel files from the steel plant
+    #         labels_name (list): columns to use as labels
+
+    #     Returns:
+    #         pd.DataFrame: all furnace records left after filtering
+    #     """
+
+    #     raw_labels=None
+    #     for name in os.listdir(raw_labels_dir):
+    #         tmp_raw_labels=pd.read_excel(os.path.join(raw_labels_dir,name))
+
+    #         choosed_column=["TSC_start_time","TSC_end_time"]
+    #         choosed_column=choosed_column+labels_name
+
+    #         # keep only the columns we want as labels
+    #         tmp_raw_labels=tmp_raw_labels.loc[:,choosed_column]
+
+    #         # rows that contain NULL
+    #         null_rows=tmp_raw_labels.isnull().any(axis=1)
+    #         # rows that contain 0
+    #         zeros_rows=(tmp_raw_labels==0).any(axis=1) | (tmp_raw_labels=='0').any(axis=1)
+
+    #         # drop every row that has either a NULL or a 0
+    #         selected_rows=~(null_rows|zeros_rows)
+    #         tmp_raw_labels=tmp_raw_labels[selected_rows]
+
+    #         if raw_labels is None:
+    #             raw_labels=tmp_raw_labels
+    #         else:
+    #             raw_labels=pd.concat([raw_labels,tmp_raw_labels],axis=0)
+    #         logging.debug(f"Reading raw label excel file:{name}, which has {tmp_raw_labels.shape[0]} furnaces")
+    #     logging.debug(f"Finished reading raw label excel files, which contain {raw_labels.shape[0]} furnaces in total")
+
+    #     return raw_labels
+
+    def _load_raw_spectral_data_cache(self,raw_spectral_data_dir:str)->list:
+        """Build a cache of all raw spectral data files, containing the start and end time recorded in each file,
+        so that later reads of the raw data are faster
+
+        Args:
+            raw_spectral_data_dir (str): directory containing the raw spectra
+
+        Returns:
+            list: the cache, one entry per raw data file; each entry holds the start and end time (as datetime) and the file path
+        """
+        spectral_file_paths=glob.glob(os.path.join(raw_spectral_data_dir,"*.csv"))
+        cache_file_paths=glob.glob(os.path.join(raw_spectral_data_dir,"*.pkl"))
+
+        update_flag=False
+
+        # regenerate the cache if it does not exist, or if the number of files it lists
+        # differs from the number of data files in the folder
+        if len(cache_file_paths)==0:
+            logging.debug("Raw spectral data cache does not exist! Generating")
+            update_flag=True
+        elif len(cache_file_paths)==1:
+            with open(cache_file_paths[0],"rb") as f:
+                raw_spectral_data_cache=pickle.load(f)
+            if len(raw_spectral_data_cache) !=len(spectral_file_paths):
+                logging.debug(f"Raw spectral data cache is out of date! Regenerating, cache file number:{len(raw_spectral_data_cache)}, spectral data file number: {len(spectral_file_paths)}")
+                update_flag=True
+        else:
+            logging.error(f"More than one 'raw_spectral_data_cache.pkl' file exists in {raw_spectral_data_dir}")
+
+        if update_flag:
+            spectral_file_paths.sort()
+            raw_spectral_data_cache=[]
+            for file in spectral_file_paths:
+                tmp_info={}
+                tmp_data=np.loadtxt(file, delimiter=",")
+                # column 0 of the csv holds the frame timestamp in ms
+                start_t=datetime.datetime.fromtimestamp(tmp_data[0,0]/1000)+datetime.timedelta(microseconds=tmp_data[0,0]%1000)
+                end_t=datetime.datetime.fromtimestamp(tmp_data[-1,0]/1000)+datetime.timedelta(microseconds=tmp_data[-1,0]%1000)
+                tmp_info["start_t"]=start_t
+                tmp_info["end_t"]=end_t
+                tmp_info["file_path"]=file
+                raw_spectral_data_cache.append(tmp_info)
+
+            with open(os.path.join(raw_spectral_data_dir,"raw_spectral_data_cache.pkl"),"wb") as f:
+                pickle.dump(raw_spectral_data_cache,f)
+
+        return raw_spectral_data_cache
+
+    def _get_spectral_spatial_dim(self,raw_spectral_data_cache):
+        # column 2 of the csv holds the number of bytes per frame, which determines the sensor layout
+        data=np.loadtxt(raw_spectral_data_cache[0]["file_path"], delimiter=",").astype(np.uint64)
+        if data[0,2]==229376:
+            spectral_dim =224
+            spatial_dim=512
+        elif data[0,2]==917504:
+            spectral_dim =448
+            spatial_dim=1024
+        return spectral_dim, spatial_dim
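+
+    # Note on the magic numbers above (a reading of the frame sizes, based on the
+    # uint16 decoding done in _read_spectral_data below): 229376 bytes per frame
+    # corresponds to 224 spectral bands x 512 spatial points x 2 bytes, and
+    # 917504 bytes corresponds to 448 x 1024 x 2.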
+
+    def _read_spectral_data(self,start_time:datetime.datetime,end_time:datetime.datetime)->np.ndarray:
+        """Fetch the spectral data recorded between start_time and end_time
+
+        Args:
+            start_time (datetime.datetime): start time
+            end_time (datetime.datetime): end time
+
+        Returns:
+            the frame timestamps (in ms) and the raw spectral data
+        """
+
+        def get_spectral_data_per_file(file_path,s_t,e_t):
+            # each csv row is (timestamp in ms, byte offset into the .bin file, bytes per frame)
+            data=np.loadtxt(file_path, delimiter=",").astype(np.uint64)
+
+            if s_t is not None:
+                tmp_s=datetime.datetime.timestamp(s_t)*1000
+                tmp_s_index=0
+                for i in range(data.shape[0]-1):
+                    if data[i,0]<=tmp_s and data[i+1,0]>=tmp_s:
+                        tmp_s_index=i
+                        break
+            else:
+                tmp_s_index=0
+            if e_t is not None:
+                tmp_e=datetime.datetime.timestamp(e_t)*1000
+                tmp_e_index=data.shape[0]
+                for i in range(tmp_s_index,data.shape[0]-1):
+                    if data[i,0]<=tmp_e and data[i+1,0]>=tmp_e:
+                        tmp_e_index=i
+                        break
+            else:
+                tmp_e_index=data.shape[0]
+
+            with open(file_path[:-3]+"bin", "rb") as f:
+                f.seek(data[tmp_s_index,1])
+                d=f.read(np.uint64((tmp_e_index-tmp_s_index)*data[tmp_s_index,2]))
+                d=np.frombuffer(d, dtype=np.uint16).reshape(tmp_e_index-tmp_s_index,self.spectral_dim,self.spatial_dim)
+            return data[tmp_s_index:tmp_e_index,0],d
+
+        timestamps=None
+        raw_spectral_data=None
+        for tmp_info in self.raw_spectral_data_cache:
+            tmp_data=None
+            if start_time<tmp_info["start_t"] and end_time>tmp_info["end_t"]:
+                # the target interval fully contains this file's interval, so take the file from start to end
+                tmp_time_stamp,tmp_data=get_spectral_data_per_file(tmp_info["file_path"] ,None,None)
+            elif start_time>=tmp_info["start_t"] and end_time<=tmp_info["end_t"]:
+                # the target interval lies inside this file's interval, so take the data from start_time to end_time
+                tmp_time_stamp,tmp_data=get_spectral_data_per_file(tmp_info["file_path"] ,start_time,end_time)
+            elif end_time>tmp_info["start_t"] and end_time<=tmp_info["end_t"]:
+                # the target interval overlaps the left side of this file's interval, so take from the file start up to end_time
+                tmp_time_stamp,tmp_data=get_spectral_data_per_file(tmp_info["file_path"] ,None,end_time)
+            elif start_time>=tmp_info["start_t"] and start_time<tmp_info["end_t"]:
+                # the target interval overlaps the right side of this file's interval, so take from start_time to the file end
+                tmp_time_stamp,tmp_data=get_spectral_data_per_file(tmp_info["file_path"] ,start_time,None)
+            if tmp_data is not None:
+                if raw_spectral_data is None:
+                    timestamps=tmp_time_stamp
+                    raw_spectral_data=tmp_data
+                else:
+                    timestamps=np.concatenate((timestamps,tmp_time_stamp),axis=0)
+                    raw_spectral_data=np.concatenate((raw_spectral_data,tmp_data),axis=0)
+        return timestamps,raw_spectral_data
+
+    def _raw_data_2_dataset(self):
+
+        save_dir=os.path.join(self.dataset_dir)
+
+        # Step 1: select the time and spatial points for the features, and save them
+
+        pre_dataset_save_dir=os.path.join(save_dir,"raw_dataset")
+
+        if not os.path.exists(pre_dataset_save_dir):
+            os.makedirs(pre_dataset_save_dir)
+
+        # The raw dataset is regenerated when this folder does not exist; if it exists, this step is skipped.
+        # !!!!!!!!!! Extra care needed: whenever new data arrives, delete the folders under dataset
+        # (i.e. clear this cache) so that everything is regenerated.
+        for i in range(self.raw_labels.shape[0]):
+
+            start_time,end_time=self.raw_labels.iloc[i]['TSC_start_time']+datetime.timedelta(seconds=-10),self.raw_labels.iloc[i]['TSC_end_time']
+            timestamps,raw_spectral_data=self._read_spectral_data(start_time,end_time)
+
+            logging.debug(f"{self.raw_labels.iloc[i]['TSC_start_time']+datetime.timedelta(seconds=-10)},{self.raw_labels.iloc[i]['TSC_end_time']}")
+            if raw_spectral_data is not None:
+                # only record the furnace if the captured frame rate is above 2 frames per second
+                if raw_spectral_data.shape[0]>2*(end_time-start_time).total_seconds():
+                    logging.debug(f"PreProcess Stage 1: [{i+1}/{self.raw_labels.shape[0]}] with {timestamps.shape[0]} frames")
+
+                    # raw_spectral_data=self.choose_frame_spatial.run(timestamps,raw_spectral_data)
+                    np.savez(os.path.join(pre_dataset_save_dir,f"{self.raw_labels.iloc[i]['Furnace_Number']}.npz"),furnaceNumber=self.raw_labels.iloc[i]['Furnace_Number'],measureStartDatetime=self.raw_labels.iloc[i]['TSC_start_time'].strftime('%Y-%m-%d %H:%M:%S'),measureEndDatetime=self.raw_labels.iloc[i]['TSC_end_time'].strftime('%Y-%m-%d %H:%M:%S'),timestamps=timestamps,rawSpectralData=raw_spectral_data,rawLabels=self.raw_labels.iloc[i][self.labels_name].to_numpy(),labelNames=["Temperature","C","P","S","Mn","Ni","Mo","Cr"])
+        # else:
+        #     logging.info(f"Pre dataset already exists in {pre_dataset_save_dir}")
+
+
+if __name__=="__main__":
+
+    logging.basicConfig(level = logging.DEBUG)
+
+    raw_data_dir="/data/SEMS-model-training/old_rawdata"
+    labels_path="/data/SEMS-model-training/labels/NanEr/2024-05-15_08-03_2335_.xlsx"
+    dataset_dir="/data/SEMS-model-training/dataset"
+    labels_name=["TSC_T","TSC_C","TSC_P","TSC_S","TSC_Mn","TSC_Ni","TSC_Mo","TSC_Cr"]
+
+    # from choose_frame_spatial.mean import ChooseFrameSpatial
+    # from features_scaling.max_min import FeatureScaling
+    # from labels_scaling.max_min import LabelScaling
+    # choose_frame_spatial=ChooseFrameSpatial(interval=[-30,30])
+    # features_scaling=FeatureScaling()
+    # labels_scaling=LabelScaling()
+
+    data_pre_process=DataPreProcess(raw_data_dir,labels_path,labels_name,dataset_dir)
\ No newline at end of file
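+
+# Illustrative note (not executed): each file written above to
+# <dataset_dir>/raw_dataset/<Furnace_Number>.npz can be loaded back with numpy,
+# using the keys passed to np.savez in _raw_data_2_dataset, e.g.:
+#
+#   sample  = np.load("/data/SEMS-model-training/dataset/raw_dataset/<Furnace_Number>.npz")
+#   spectra = sample["rawSpectralData"]   # (frames, spectral_dim, spatial_dim), uint16 counts
+#   stamps  = sample["timestamps"]        # per-frame timestamps in ms
+#   labels  = sample["rawLabels"]         # values matching sample["labelNames"]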