stable version: prepare to support training different models on different devices

This commit is contained in:
user 2024-11-07 14:37:51 +08:00
parent cfac5fd675
commit 6e333fe733
66 changed files with 2650 additions and 591 deletions

3
.gitignore vendored
View File

@ -1,5 +1,4 @@
dataset
rawdata
old_rawdata
labels
logs

View File

@ -1,5 +1,15 @@
# Change Log
## v0.1.0.20240910_alpha
## v0.0.2.20241015_alpha
# [⭐Features]
- Fetch data from the MySQL database
# [🔄Changed]
- Changed the data processing pipeline: first fetch data from the database and split it into training and test sets to produce intermediate files, then perform subsequent processing such as spatial-point selection
## v0.0.1.20240910_alpha
- End-to-end training pipeline runs

15
TODO.md
View File

@ -0,0 +1,15 @@
- [ ] Survey all tunable hyperparameters
- [ ] Survey loss designs for regression tasks
- [ ] Survey model architectures applicable to regression and their tunable parameters
- [ ] Survey methods for escaping local optima
- [x] Convert the data in the database into training, validation, and test sets
- [x] Run different preprocessing, models, and hyperparameters on the training/validation sets to obtain the best model
- [x] Save the best model and its corresponding hyperparameters, and evaluate it on the test set to obtain the test error
- [x] Use different models for different target parameters

View File

@ -5,16 +5,16 @@ defaults:
task_name: main
raw_spectral_data_dir: /code/admin/20240806-NanEr-5-8-data/rawdata
raw_labels_dir: /code/admin/20240806-NanEr-5-8-data/labels/NanEr
dataset_dir: /code/admin/20240806-NanEr-5-8-data/dataset
labels_name: ["TSC_T","TSC_C","TSC_C_lab","TSC_P_lab"]
# raw_spectral_data_dir: /data/SEMS-model-training/rawdata
# raw_labels_dir: /data/SEMS-model-training/labels/NanEr
dataset_dir: /data/SEMS-model-training/dataset
# labels_name: ["TSC_T","TSC_C","TSC_C_lab","TSC_P_lab"]
dataset:
train_ratio: 0.8
validate_ratio: 0.1
num_worker: 128
# train_ratio: 0.8
# validate_ratio: 0.1
num_worker: 0
train:
max_epoch: 10000
max_epoch: 5000
checkpoint_interval: 100
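The block above is a Hydra-style config; a minimal sketch of inspecting it directly with OmegaConf, assuming it is saved as conf/main.yaml (the path and script name are illustrative, not from this repo):

from omegaconf import OmegaConf

cfg = OmegaConf.load("conf/main.yaml")        # hypothetical location of the config above
print(cfg.dataset_dir)                        # /data/SEMS-model-training/dataset
print(cfg.dataset.num_worker, cfg.train.max_epoch)

# In a Hydra app the same fields can be overridden from the command line, e.g.:
#   python train.py train.max_epoch=10000 dataset.num_worker=8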

View File

@ -1,40 +1,37 @@
run:
num_samples: 1
resources_per_trial:
cpu: 128
gpu: 1
cpu: 4
gpu: 0.1
scheduler:
_target_: ray.tune.schedulers.ASHAScheduler
metric: val_loss
mode: min
max_t: 20000
grace_period: 1
reduction_factor: 2
_target_: ray.tune.schedulers.FIFOScheduler
# _target_: ray.tune.schedulers.ASHAScheduler
# metric: val_loss
# mode: min
# max_t: 20000
# grace_period: 1
# reduction_factor: 2
config:
choose_frame_spatial:
_target_: ray.tune.grid_search
values:
- _target_: data.choose_frame_spatial.mean.ChooseFrameSpatial
interval: [-3,0]
- _target_: data.choose_frame_spatial.DBSCAN.ChooseFrameSpatial ##DBSCAN_data_augmentation
features_scaling:
_target_: ray.tune.grid_search
values:
- _target_: data.features_scaling.max_min.FeatureScaling
- _target_: data.features_scaling.standardization.FeatureScaling #
labels_scaling:
_target_: ray.tune.grid_search
values:
- _target_: data.labels_scaling.max_min.LabelScaling
- _target_: data.labels_scaling.standardization.LabelScaling #standardization
model:
_target_: ray.tune.grid_search
values:
# - _target_: model.FCNN.FCNN
# - _target_: model.CNN_LSTM_FCNN.CNN_LSTM_FCNN
- _target_: model.CNN1D_FCNN.CNN1D_FCNN
- _target_: model.DRSN-CW.SpectralModel
# - _target_: model.FCNN_DBSCAN_small.SpectralModel
criterion:
_target_: ray.tune.grid_search
values:
@ -42,10 +39,15 @@ config:
optimizer:
_target_: ray.tune.grid_search
values:
- _target_: torch.optim.Adam
_partial_: true
lr: 0.000001
- _target_: torch.optim.Adam
_partial_: true
lr: 0.0001
batch_size:
_target_: ray.tune.grid_search
values:
- 1024
- 128
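Each grid_search entry above enumerates concrete values for one component. A minimal sketch (the function name is illustrative, not the project's actual trainable) of how one sampled trial config with _target_ entries could be turned into live objects inside a Ray Tune trainable via hydra.utils.instantiate:

from hydra.utils import instantiate

def train_fn(trial_cfg):
    # each field holds one concrete value chosen by ray.tune.grid_search
    choose_frame_spatial = instantiate(trial_cfg["choose_frame_spatial"])  # e.g. DBSCAN.ChooseFrameSpatial
    model = instantiate(trial_cfg["model"])                                # e.g. CNN1D_FCNN
    criterion = instantiate(trial_cfg["criterion"])
    opt_fn = instantiate(trial_cfg["optimizer"])      # _partial_: true -> returns a functools.partial
    optimizer = opt_fn(model.parameters())
    batch_size = trial_cfg["batch_size"]
    ...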

View File

@ -0,0 +1,142 @@
'''
@File : mean.py
@Time : 2024/08/12 13:53:36
@Author : Zhanpeng Yang
@Version : 0.0.1
@Contact : zhpyang@outlook.com
@Desc    : First-stage data preprocessing: select suitable spectra from the continuous spectral data as the input features
'''
import datetime
import numpy as np
from sklearn.cluster import DBSCAN
from pathlib import Path
class ChooseFrameSpatial:
def __init__(self,eps=0.15,min_samples=10):
"""注意,算法所需要的超参数均从此处传入,作为类成员,其他函数不允许传入超参数
"""
self.eps=eps
self.min_samples=min_samples
self.description=f"{str(self.__class__)[8:-2].split(".")[-2]}_eps_{eps}_min_samples_{min_samples}" #此项当没有新增超参数时不需要修改
self.file_path = Path(__file__).absolute()
def load_state_dict(self,state):
self.eps=state["eps"]
self.min_samples=state["min_samples"]
def state_dict(self):
return {"eps":self.eps,"min_samples":self.min_samples}
def get_specific_data(self,measureStartDatetime,measureEndDatetime,timestamps,rawdata):
"""
Fetch the spectral data within the specified time window
"""
if isinstance(measureStartDatetime.item(),str):
start_timestamp=datetime.datetime.strptime(measureStartDatetime.item(), "%Y-%m-%d %H:%M:%S").timestamp()*1000 -1000 # measurement start time - 1 s; all timestamps in this project are in ms
end_timestamp=datetime.datetime.strptime(measureStartDatetime.item(), "%Y-%m-%d %H:%M:%S").timestamp()*1000 + 1000 # measurement start time + 1 s
else:
start_timestamp=measureStartDatetime.item().timestamp()*1000 - 1000
end_timestamp=measureStartDatetime.item().timestamp()*1000 + 1000
start_index=0
end_index=timestamps.shape[0]
for i in range(1,timestamps.shape[0]):
if timestamps[i]>=start_timestamp and timestamps[i-1]<=start_timestamp:
# print(f"开始索引为{i}")
start_index=i
if timestamps[i]>=end_timestamp and timestamps[i-1]<=end_timestamp:
# print(f"结束索引为{i}")
end_index=i
return rawdata[start_index:end_index,...]
def run(self,measureStartDatetime,measureEndDatetime,timestamps,rawdata):
rawSpectralData=self.get_specific_data(measureStartDatetime,measureEndDatetime,timestamps,rawdata)
rawSpectralData=rawSpectralData.transpose(0, 2, 1)
rawSpectralData=rawSpectralData.reshape((rawSpectralData.shape[0]*rawSpectralData.shape[1],rawSpectralData.shape[2]))
rawSpectralData_normed=(rawSpectralData-np.min(rawSpectralData,axis=1,keepdims=True))/(np.max(rawSpectralData,axis=1,keepdims=True)-np.min(rawSpectralData,axis=1,keepdims=True))
db_norm = DBSCAN(eps=self.eps, min_samples=self.min_samples).fit(rawSpectralData_normed)
labels_norm = db_norm.labels_
n_norm = len(set(labels_norm)) - (1 if -1 in labels_norm else 0)
if n_norm==0:
return None
max_i=0
max_num=0
for i in range(n_norm):
tmp=(labels_norm==i).sum()
if tmp>max_num:
max_i=i
max_num=tmp
selected_data=rawSpectralData_normed[labels_norm==max_i,:]
selected_data=np.mean(selected_data,axis=0)
# space_intensity_mean = np.mean(np.sum(selected_data, axis=1), axis=0)
# space_num= 10 #空间上平均光强合适且方差最小的10个点
# light_indices = np.where( (space_intensity_mean > (0.25*np.max(space_intensity_mean)+0.75*np.min(space_intensity_mean)))
# & (space_intensity_mean < (0.8*np.max(space_intensity_mean)+0.2*np.min(space_intensity_mean)))
# )
# if light_indices[0].shape[0] < space_num:
# print('No Enough Usable Space Point Found!')
# return None
# intensity_data = selected_data[:,:,light_indices[0]]
# # spatial_variance = np.var(np.sum(intensity_data,axis=1), axis=0)
# # top_10_indices = np.argsort(spatial_variance)[:space_num]
# space_selected_data = intensity_data[:,:,:]
# space_selected_data = np.mean(np.mean(space_selected_data,axis=2),axis=0)
# space_selected_data = (space_selected_data - np.min(space_selected_data)) / (np.max(space_selected_data) - np.min(space_selected_data))
# # Savitzky-Golay filter
# data_filtered = savgol_filter(space_selected_data, window_length=31, polyorder=2, axis=0)
# spec_indices = [106, 127, 118, 0, 52, 23, 113, 77, 157, 175, 195, 24, 218, 108, 8, 211, 2, 136, 11,
# 36, 129, 109, 153, 188, 200, 196, 7, 19, 6, 198, 193, 221, 156, 187, 162, 204, 85, 104, 41,
# 16, 185, 125, 14, 149, 91, 138, 72, 146, 35, 53, 190, 148, 75, 18, 17, 96, 167, 192, 201, 31,
# 158, 183, 32, 40, 123, 145, 161, 27, 209, 216, 101, 51, 147, 58, 182, 49, 119, 13, 179, 140,
# 105, 45, 55, 33, 73, 111, 97, 194, 121, 89, 38, 12, 197, 173, 160, 131, 141, 37, 208, 47]
# selected_data = data_filtered[spec_indices]
return selected_data
if __name__ =="__main__":
raw_data=np.load("/data/SEMS-model-training/dataset/raw_dataset/test/24609591.npz",allow_pickle=True)
print(raw_data.keys())
print(f"炉次号\t{raw_data["furnaceNumber"]}")
print(f"开始测量时间\t{raw_data["measureStartDatetime"]}")
print(f"结束测量时间\t{raw_data["measureEndDatetime"]}")
print(f"时间戳维度\t{raw_data["timestamps"].shape}")
print(f"原始光谱数据维度\t{raw_data["rawSpectralData"].shape}")
tmp=ChooseFrameSpatial()
output=tmp.run(raw_data["measureStartDatetime"],raw_data["measureEndDatetime"],raw_data["timestamps"],raw_data["rawSpectralData"])
print(f"输入数据维度:{raw_data["rawSpectralData"].shape}\n输出数据维度:{output.shape}")

View File

@ -0,0 +1,102 @@
'''
@File : mean.py
@Time : 2024/08/12 13:53:36
@Author : Zhanpeng Yang
@Version : 0.0.1
@Contact : zhpyang@outlook.com
@Desc    : First-stage data preprocessing: select suitable spectra from the continuous spectral data as the input features
'''
import datetime
import numpy as np
from sklearn.cross_decomposition import PLSRegression
from scipy.signal import savgol_filter
from sklearn.cluster import DBSCAN
class ChooseFrameSpatial:
def __init__(self,eps=0.15,min_samples=10):
"""注意,算法所需要的超参数均从此处传入,作为类成员,其他函数不允许传入超参数
"""
self.eps=eps
self.min_samples=min_samples
self.description=f"{str(self.__class__)[8:-2].split(".")[-2]}_eps_{eps}_min_samples_{min_samples}" #此项当没有新增超参数时不需要修改
def get_specific_data(self,measureStartDatetime,measureEndDatetime,timestamps,rawdata):
"""
Fetch the spectral data within the specified time window
"""
if isinstance(measureStartDatetime.item(),str):
start_timestamp=datetime.datetime.strptime(measureStartDatetime.item(), "%Y-%m-%d %H:%M:%S").timestamp()*1000 -1000 # measurement start time - 1 s; all timestamps in this project are in ms
end_timestamp=datetime.datetime.strptime(measureStartDatetime.item(), "%Y-%m-%d %H:%M:%S").timestamp()*1000 + 1000 # measurement start time + 1 s
else:
start_timestamp=measureStartDatetime.item().timestamp()*1000 - 1000
end_timestamp=measureStartDatetime.item().timestamp()*1000 + 1000
start_index=0
end_index=timestamps.shape[0]
for i in range(1,timestamps.shape[0]):
if timestamps[i]>=start_timestamp and timestamps[i-1]<=start_timestamp:
# print(f"开始索引为{i}")
start_index=i
if timestamps[i]>=end_timestamp and timestamps[i-1]<=end_timestamp:
# print(f"结束索引为{i}")
end_index=i
return rawdata[start_index:end_index,...]
def run(self,measureStartDatetime,measureEndDatetime,timestamps,rawdata):
rawSpectralData=self.get_specific_data(measureStartDatetime,measureEndDatetime,timestamps,rawdata)
rawSpectralData=rawSpectralData.transpose(0, 2, 1)
rawSpectralData=rawSpectralData.reshape((rawSpectralData.shape[0]*rawSpectralData.shape[1],rawSpectralData.shape[2]))
rawSpectralData_normed=(rawSpectralData-np.min(rawSpectralData,axis=1,keepdims=True))/(np.max(rawSpectralData,axis=1,keepdims=True)-np.min(rawSpectralData,axis=1,keepdims=True))
db_norm = DBSCAN(eps=self.eps, min_samples=self.min_samples).fit(rawSpectralData_normed)
labels_norm = db_norm.labels_
n_norm = len(set(labels_norm)) - (1 if -1 in labels_norm else 0)
if n_norm==0:
return None
max_i=0
max_num=0
for i in range(n_norm):
tmp=(labels_norm==i).sum()
if tmp>max_num:
max_i=i
max_num=tmp
selected_data=rawSpectralData_normed[labels_norm==max_i,:]
# selected_data=np.mean(selected_data,axis=0)
return selected_data
if __name__ =="__main__":
raw_data=np.load("/data/SEMS-model-training/dataset/raw_dataset/test/24609591.npz",allow_pickle=True)
print(raw_data.keys())
print(f"炉次号\t{raw_data["furnaceNumber"]}")
print(f"开始测量时间\t{raw_data["measureStartDatetime"]}")
print(f"结束测量时间\t{raw_data["measureEndDatetime"]}")
print(f"时间戳维度\t{raw_data["timestamps"].shape}")
print(f"原始光谱数据维度\t{raw_data["rawSpectralData"].shape}")
tmp=ChooseFrameSpatial()
output=tmp.run(raw_data["measureStartDatetime"],raw_data["measureEndDatetime"],raw_data["timestamps"],raw_data["rawSpectralData"])
print(f"输入数据维度:{raw_data["rawSpectralData"].shape}\n输出数据维度:{output.shape}")

View File

@ -1,100 +0,0 @@
'''
@File : mean.py
@Time : 2024/08/12 13:53:36
@Author : Zhanpeng Yang
@Version : 0.0.1
@Contact : zhpyang@outlook.com
@Desc    : First-stage data preprocessing: select suitable spectra from the continuous spectral data as the input features
'''
import datetime
import numpy as np
class ChooseFrameSpatial:
def __init__(self, interval=[-3,0]):
"""注意,算法所需要的超参数均从此处传入,作为类成员,其他函数不允许传入超参数
Args:
interval (list, optional): 此方法依赖的光谱时间范围默认[-30,30]即获取前TSC开始时刻前30秒到TSC结束时刻的光谱数据作为此算法输入.
"""
self.description=f"{str(self.__class__).split(".")[2]}_{interval[0]}_{interval[1]}"
# print(self.description)
self.interval=interval
def get_data_interval(self, start_time, end_time ):
return start_time+datetime.timedelta(seconds=self.interval[0]),start_time+datetime.timedelta(seconds=self.interval[1])
def run(self,timestamps,rawdata):
############# spatial data compression ######################
space_intensity_mean = np.mean(np.sum(rawdata, axis=1), axis=0)
space_num= 10 # number of spatial points with the smallest variance to keep
light_indices = np.where( (space_intensity_mean > (0.25*np.max(space_intensity_mean)+0.75*np.min(space_intensity_mean)))
& (space_intensity_mean < (0.8*np.max(space_intensity_mean)+0.2*np.min(space_intensity_mean)))
)
if light_indices[0].shape[0] < space_num:
print('No Enough Usable Space Point Found!')
return None
intensity_data = rawdata[:,:,light_indices[0]]
spatial_variance = np.var(np.sum(intensity_data,axis=1), axis=0)
top_10_indices = np.argsort(spatial_variance)[:space_num]
space_selected_data = intensity_data[:, :, top_10_indices]
############# temporal data compression ######################
# framerate = 24
# timewindow = 2
# time_data_intensity = np.sum(np.mean(space_selected_data,axis=2),axis=1)
# min_var = float('inf')
# min_index = 0
# for i in range(len(time_data_intensity)-framerate*timewindow-1):
# window_var = np.var(time_data_intensity[i:i+framerate*timewindow])
# if window_var < min_var:
# min_var = window_var
# min_index = i
# selected_data = space_selected_data[min_index:min_index+framerate*timewindow,:,:]
selected_data=np.mean(np.mean(space_selected_data,axis=0),axis=1)
# print("timewindow_begin=",min_index)
# if (result_dir is not None) and (filenum is not None) :
# for i in range (selected_data.shape[2]):
# Z=selected_data[:,:,i]
# x=np.linspace(400,1000,Z.shape[1])
# y=selected_data[:,1,i]
# x, y = np.meshgrid(x, y)
# fig, ax = plt.subplots(subplot_kw=dict(projection='3d'))
# surf = ax.plot_surface(x, y, Z, cmap=cm.Blues,
# linewidth=0, antialiased=False)
# ax.view_init(elev=30, azim=-60) # change the view angle
# ax.set_zlim(0, 4095)
# ax.set_zlabel("Light Intensity")
# ax.set_xlabel("Spectral Band(nm)")
# ax.set_ylabel("Time (Serial Number)")
# plt.savefig(f"{result_dir}{ filenum} file {i}-th spatial_point_line.png")
# plt.close()
return selected_data
if __name__ =="__main__":
timpestamp=np.zeros((600,))
input_data=np.random.random((600,224,512))*4095
tmp=ChooseFrameSpatial()
output=tmp.run(timpestamp,input_data)
print(f"输入数据维度:{input_data.shape}\n输出数据维度:{output.shape}")

View File

@ -0,0 +1,125 @@
'''
@File : mean.py
@Time : 2024/08/12 13:53:36
@Author : Zhanpeng Yang
@Version : 0.0.1
@Contact : zhpyang@outlook.com
@Desc    : First-stage data preprocessing: select suitable spectra from the continuous spectral data as the input features
'''
import datetime
import numpy as np
from sklearn.cross_decomposition import PLSRegression
from scipy.signal import savgol_filter
from scipy.ndimage import median_filter
class ChooseFrameSpatial:
def __init__(self):
"""注意,算法所需要的超参数均从此处传入,作为类成员,其他函数不允许传入超参数
"""
self.description=f"{str(self.__class__)[8:-2].split(".")[-2]}" #此项当没有新增超参数时不需要修改
print(self.description)
def get_specific_data(self,measureStartDatetime,measureEndDatetime,timestamps,rawdata):
"""
Fetch the spectral data within the specified time window
"""
if isinstance(measureStartDatetime.item(),str):
start_timestamp=datetime.datetime.strptime(measureStartDatetime.item(), "%Y-%m-%d %H:%M:%S").timestamp()*1000 # measurement start time; all timestamps in this project are in ms
end_timestamp=datetime.datetime.strptime(measureStartDatetime.item(), "%Y-%m-%d %H:%M:%S").timestamp()*1000 + 3000 # measurement start time + 3 s
else:
start_timestamp=measureStartDatetime.item().timestamp()*1000
end_timestamp=measureStartDatetime.item().timestamp()*1000 + 3000
start_index=0
end_index=timestamps.shape[0]
for i in range(1,timestamps.shape[0]):
if timestamps[i]>=start_timestamp and timestamps[i-1]<=start_timestamp:
# print(f"开始索引为{i}")
start_index=i
if timestamps[i]>=end_timestamp and timestamps[i-1]<=end_timestamp:
# print(f"结束索引为{i}")
end_index=i
return rawdata[start_index:end_index,...]
def run(self,measureStartDatetime,measureEndDatetime,timestamps,rawdata):
selected_data=self.get_specific_data(measureStartDatetime,measureEndDatetime,timestamps,rawdata)
space_intensity_mean = np.mean(np.sum(selected_data, axis=1), axis=0)
space_num= 10 # keep the 10 spatial points with suitable mean intensity and the smallest variance
light_indices = np.where( (space_intensity_mean > (0.25*np.max(space_intensity_mean)+0.75*np.min(space_intensity_mean)))
& (space_intensity_mean < (0.8*np.max(space_intensity_mean)+0.2*np.min(space_intensity_mean)))
)
if light_indices[0].shape[0] < space_num:
print('No Enough Usable Space Point Found!')
return None
intensity_data = selected_data[:,:,light_indices[0]]
# spatial_variance = np.var(np.sum(intensity_data,axis=1), axis=0)
# top_10_indices = np.argsort(spatial_variance)[:space_num]
time_window = 11
time_steps, wavelengths, spatial_points = intensity_data.shape
filtered_data = np.zeros_like(intensity_data)
for wavelength in range(wavelengths):
for spatial_point in range(spatial_points):
# extract the time series of this (wavelength, spatial point) pair
time_series = intensity_data[:, wavelength, spatial_point]
# apply the median filter
filtered_time_series = median_filter(time_series, size=time_window)
# write the filtered time series back to its original position
filtered_data[:, wavelength, spatial_point] = filtered_time_series
space_selected_data = filtered_data[:,:,:]
space_selected_data = np.mean(np.mean(space_selected_data,axis=2),axis=0)
# space_selected_data = (space_selected_data - np.min(space_selected_data)) / (np.max(space_selected_data) - np.min(space_selected_data))
# # Savitzky-Golay filter
# data_filtered = savgol_filter(space_selected_data, window_length=31, polyorder=2, axis=0)
# spec_indices = [106, 127, 118, 0, 52, 23, 113, 77, 157, 175, 195, 24, 218, 108, 8, 211, 2, 136, 11,
# 36, 129, 109, 153, 188, 200, 196, 7, 19, 6, 198, 193, 221, 156, 187, 162, 204, 85, 104, 41,
# 16, 185, 125, 14, 149, 91, 138, 72, 146, 35, 53, 190, 148, 75, 18, 17, 96, 167, 192, 201, 31,
# 158, 183, 32, 40, 123, 145, 161, 27, 209, 216, 101, 51, 147, 58, 182, 49, 119, 13, 179, 140,
# 105, 45, 55, 33, 73, 111, 97, 194, 121, 89, 38, 12, 197, 173, 160, 131, 141, 37, 208, 47]
# selected_data = data_filtered[spec_indices]
# print(space_selected_data.shape)
return space_selected_data
# return selected_data
if __name__ =="__main__":
raw_data=np.load("/data/SEMS-model-training/dataset/raw_dataset/training/24604919.npz",allow_pickle=True)
print(raw_data.keys())
print(f"炉次号\t{raw_data["furnaceNumber"]}")
print(f"开始测量时间\t{raw_data["measureStartDatetime"]}")
print(f"结束测量时间\t{raw_data["measureEndDatetime"]}")
print(f"时间戳维度\t{raw_data["timestamps"].shape}")
print(f"原始光谱数据维度\t{raw_data["rawSpectralData"].shape}")
tmp=ChooseFrameSpatial()
output=tmp.run(raw_data["measureStartDatetime"],raw_data["measureEndDatetime"],raw_data["timestamps"],raw_data["rawSpectralData"])
print(f"输入数据维度:{raw_data["rawSpectralData"].shape}\n输出数据维度:{output.shape}")

View File

@ -0,0 +1,83 @@
'''
@File : mean.py
@Time : 2024/08/12 13:53:36
@Author : Zhanpeng Yang
@Version : 0.0.1
@Contact : zhpyang@outlook.com
@Desc    : First-stage data preprocessing: select suitable spectra from the continuous spectral data as the input features
'''
import datetime
import numpy as np
class ChooseFrameSpatial:
def __init__(self,):
"""注意,算法所需要的超参数均从此处传入,作为类成员,其他函数不允许传入超参数
"""
self.description=f"{str(self.__class__)[8:-2].split(".")[-2]}" #此项当没有新增超参数时不需要修改
print(self.description)
def get_specific_data(self,measureStartDatetime,measureEndDatetime,timestamps,rawdata):
"""
Fetch the spectral data within the specified time window
"""
start_timestamp=datetime.datetime.strptime(measureStartDatetime.item(), "%Y-%m-%d %H:%M:%S").timestamp()*1000 -1000 # measurement start time - 1 s; all timestamps in this project are in ms
end_timestamp=datetime.datetime.strptime(measureStartDatetime.item(), "%Y-%m-%d %H:%M:%S").timestamp()*1000 +1000 # measurement start time + 1 s
for i in range(1,timestamps.shape[0]):
if timestamps[i]>=start_timestamp and timestamps[i-1]<=start_timestamp:
# print(f"开始索引为{i}")
start_index=i
if timestamps[i]>=end_timestamp and timestamps[i-1]<=end_timestamp:
# print(f"结束索引为{i}")
end_index=i
return rawdata[start_index:end_index,...]
def run(self,measureStartDatetime,measureEndDatetime,timestamps,rawdata):
selected_data=self.get_specific_data(measureStartDatetime,measureEndDatetime,timestamps,rawdata)
space_intensity_mean = np.mean(np.sum(selected_data, axis=1), axis=0)
space_num= 10 # keep the 10 spatial points with suitable mean intensity and the smallest variance
light_indices = np.where( (space_intensity_mean > (0.25*np.max(space_intensity_mean)+0.75*np.min(space_intensity_mean)))
& (space_intensity_mean < (0.8*np.max(space_intensity_mean)+0.2*np.min(space_intensity_mean)))
)
if light_indices[0].shape[0] < space_num:
print('No Enough Usable Space Point Found!')
return None
intensity_data = selected_data[:,:,light_indices[0]]
spatial_variance = np.var(np.sum(intensity_data,axis=1), axis=0)
top_10_indices = np.argsort(spatial_variance)[:space_num]
selected_data = intensity_data[:, :, top_10_indices]
return selected_data
if __name__ =="__main__":
raw_data=np.load("/data/SEMS-model-training/dataset/raw_dataset/training/24604919.npz",allow_pickle=True)
print(raw_data.keys())
print(f"炉次号\t{raw_data["furnaceNumber"]}")
print(f"开始测量时间\t{raw_data["measureStartDatetime"]}")
print(f"结束测量时间\t{raw_data["measureEndDatetime"]}")
print(f"时间戳维度\t{raw_data["timestamps"].shape}")
print(f"原始光谱数据维度\t{raw_data["rawSpectralData"].shape}")
tmp=ChooseFrameSpatial()
output=tmp.run(raw_data["measureStartDatetime"],raw_data["measureEndDatetime"],raw_data["timestamps"],raw_data["rawSpectralData"])
print(f"输入数据维度:{raw_data["rawSpectralData"].shape}\n输出数据维度:{output.shape}")

View File

@ -1,132 +0,0 @@
'''
@File : mean.py
@Time : 2024/08/12 13:53:36
@Author : Zhanpeng Yang
@Version : 0.0.1
@Contact : zhpyang@outlook.com
@Desc    : First-stage data preprocessing: select suitable spectra from the continuous spectral data as the input features
'''
import datetime
import numpy as np
class ChooseFrameSpatial:
def __init__(self, interval=[-30,30],overexposion_threshold = 1,weight_mean_var = 0.5,space_num= 10, time_num = 9,framerate = 4,timewindow_time = 1,Type=6):
"""注意,算法所需要的超参数均从此处传入,作为类成员,其他函数不允许传入超参数
Args:
interval (list, optional): 此方法依赖的光谱时间范围默认[-30,30]即获取前TSC开始时刻前30秒到TSC结束时刻的光谱数据作为此算法输入.
"""
# print(str(self.__class__).split("."))
self.description=f"{str(self.__class__).split(".")[-2]}_{interval[0]}_{interval[1]}"
# print(self.description)
self.overexposion_threshold = overexposion_threshold #过曝阈值:有多少时间点过曝时,判定为这个空间点过曝
self.weight_mean_var =weight_mean_var #强度/方差选取权重默认平均光强权重为1此权重为方差权重取值为[0
self.space_num= space_num #选取空间点数量
self.time_num = time_num #选取时间点数量
self.framerate = framerate #采样帧率
self.timewindow = framerate * timewindow_time #选取时间窗口为1s
self.interval=interval
self.Type=Type
def get_data_interval(self, start_time, end_time ):
return start_time+datetime.timedelta(seconds=self.interval[0]),end_time+datetime.timedelta(seconds=self.interval[1])
def Overexposed_spatial_point_detection(self,inputdata, timethreshold):
# determine whether a spatial point is overexposed; inputdata is a 2-D (time, wavelength) array, timethreshold is the time threshold: if at least that many time points are overexposed, the point is considered overexposed
# returns False if overexposed, True otherwise
row_max_values= np.max(inputdata, axis=1)
overexposed_rows = np.sum(row_max_values >= 4095)
if overexposed_rows > timethreshold:
return False
else:
return True
def run(self,timestamps,rawdata):
# Dimensionality reduction: spatially, keep points whose intensity exceeds a threshold and whose variance is smallest; temporally, split the whole period into segments and average the highest-weighted 1 s within each segment. Type=1 keeps the first 1/3, Type=2 the middle 1/3, Type=3 the last 1/3, Type=4 everything
############# remove overexposed spatial points ######################
unoverposed_indices = [i for i in range(rawdata.shape[2]) if self.Overexposed_spatial_point_detection(rawdata[:, :, i],self.overexposion_threshold)]
rawdata = rawdata[:,:,unoverposed_indices]
############# spatial data compression ######################
space_intensity_mean = np.mean(np.sum(rawdata, axis=1), axis=0)
space_intensity_var = np.var(np.sum(rawdata, axis=1), axis=0)
combined_score = (space_intensity_mean - np.min(space_intensity_mean)) / (np.max(space_intensity_mean) - np.min(space_intensity_mean)) \
- self.weight_mean_var * (space_intensity_var - np.min(space_intensity_var)) / (np.max(space_intensity_var) - np.min(space_intensity_var))
sorted_indices = np.argsort(combined_score)[::-1]
space_selected_data = rawdata[:,:,sorted_indices[:self.space_num]]
############# temporal data compression ######################
space_selected_data_intensity = np.sum(space_selected_data,axis=1)
# split the intensity data along time into time_num chunks, each of shape (total time points / time_num, space_num)
time_length = space_selected_data_intensity.shape[0]
chunk_size = time_length // self.time_num
remainder = time_length % self.time_num
start_index = 0
time_selected_data =None
for i in range(self.time_num):
end_index = start_index + chunk_size + (1 if i < remainder else 0)
chunk = space_selected_data_intensity[start_index:end_index, :]
window_var = np.empty(0)
window_mean = np.empty(0)
for j in range(len(chunk)-self.timewindow-1):
window_var = np.concatenate((window_var, np.expand_dims( np.sum(np.var(chunk[j:j+self.timewindow,:], axis=0)),0)), axis=0)
window_mean = np.concatenate((window_mean, np.expand_dims( np.sum(np.mean(chunk[j:j+self.timewindow,:], axis=0)),0)), axis=0)
combined_score_time = (window_mean - np.min(window_mean)) / (np.max(window_mean) - np.min(window_mean)) \
- self.weight_mean_var * (window_var - np.min(window_var)) / (np.max(window_var) - np.min(window_var))
sorted_indices = np.argsort(combined_score_time)[::-1]
time_window_data = np.mean(space_selected_data[start_index+sorted_indices[0]:start_index+sorted_indices[0]+self.timewindow, :, :],axis=0)
# print(time_selected_data.shape,time_window_data.shape,np.expand_dims( time_window_data,0).shape)
if time_selected_data is None:
time_selected_data=np.expand_dims( time_window_data,0)
else:
time_selected_data = np.concatenate((time_selected_data, np.expand_dims( time_window_data,0)), axis=0)
start_index = end_index
if self.Type == 1:
print("time_selected_data[前1/3].shape: ", time_selected_data[:self.time_num//3,:,:].shape)
return time_selected_data[:self.time_num//3,:,:]
elif self.Type == 2:
print("time_selected_data[中1/3].shape: ", time_selected_data[self.time_num//3:2*self.time_num//3,:,:].shape)
return time_selected_data[self.time_num//3:2*self.time_num//3,:,:]
elif self.Type == 3:
print("time_selected_data[后1/3].shape: ", time_selected_data[2*self.time_num//3:,:,:].shape)
return time_selected_data[2*self.time_num//3:,:,:]
elif self.Type == 4:
print("time_selected_data[前2/3].shape: ", time_selected_data[:2*self.time_num//3,:,:].shape)
return time_selected_data[:2*self.time_num//3,:,:]
elif self.Type == 5:
print("time_selected_data[后2/3].shape: ", time_selected_data[self.time_num//3:,:,:].shape)
return time_selected_data[self.time_num//3:,:,:]
elif self.Type == 6:
print("time_selected_data.shape: ", time_selected_data.shape)
return time_selected_data
else:
print("Type is not 1, 2, 3, 4, 5 or 6 !")
return None
if __name__ =="__main__":
timpestamp=np.zeros((600,))
input_data=np.random.random((600,224,512))*4095
tmp=ChooseFrameSpatial()
output=tmp.run(timpestamp,input_data)
print(f"输入数据维度:{input_data.shape}\n输出数据维度:{output.shape}")

View File

@ -0,0 +1,78 @@
'''
@File : mean.py
@Time : 2024/08/12 13:53:36
@Author : Zhanpeng Yang
@Version : 0.0.1
@Contact : zhpyang@outlook.com
@Desc    : First-stage data preprocessing: select suitable spectra from the continuous spectral data as the input features
'''
import datetime
import numpy as np
from scipy.signal import savgol_filter
class ChooseFrameSpatial:
def __init__(self,):
"""注意,算法所需要的超参数均从此处传入,作为类成员,其他函数不允许传入超参数
Args:
interval (list, optional): 此方法依赖的光谱时间范围默认[-30,30]即获取前TSC开始时刻前30秒到TSC结束时刻的光谱数据作为此算法输入.
"""
# tmp=
# if len(tmp)>2:
# self.description=f"{str(self.__class__).split(".")[-2]}_{interval[0]}_{interval[1]}"
self.description=f"{str(self.__class__)[8:-2].split(".")[-2]}"
print(self.description)
# print(self.description)
def run(self,measureStartDatetime,measureEndDatetime,timestamps,rawdata):
############# spatial data compression ######################
space_intensity_mean = np.mean(np.sum(rawdata, axis=1), axis=0)
space_num= 10 # number of spatial points with the smallest variance to keep
light_indices = np.where( (space_intensity_mean > (0.25*np.max(space_intensity_mean)+0.75*np.min(space_intensity_mean)))
& (space_intensity_mean < (0.8*np.max(space_intensity_mean)+0.2*np.min(space_intensity_mean)))
)
if light_indices[0].shape[0] < space_num:
print('No Enough Usable Space Point Found!')
return None
intensity_data = rawdata[:,:,light_indices[0]]
spatial_variance = np.var(np.sum(intensity_data,axis=1), axis=0)
top_10_indices = np.argsort(spatial_variance)[:space_num]
space_selected_data = intensity_data[:, :, top_10_indices]
selected_data=np.mean(np.mean(space_selected_data,axis=0),axis=1)
selected_data = savgol_filter(selected_data, window_length=11, polyorder=2, axis=0)
top_features = [106, 127, 118, 0, 52, 23, 113, 77, 157, 175, 195, 24, 218, 108, 8, 211, 2, 136,
11, 36, 129, 109, 153, 188, 200, 196, 7, 19, 6, 198, 193, 221, 156, 187, 162, 204,
85, 104, 41, 16, 185, 125, 14, 149, 91, 138, 72, 146, 35, 53, 190, 148, 75, 18,
17, 96, 167, 192, 201, 31, 158, 183, 32, 40, 123, 145, 161, 27, 209, 216, 101, 51,
147, 58, 182, 49, 119, 13, 179, 140, 105, 45, 55, 33, 73, 111, 97, 194, 121, 89,
38, 12, 197, 173, 160, 131, 141, 37, 208, 47]
selected_data=selected_data[top_features]
return selected_data
if __name__ =="__main__":
timpestamp=np.zeros((600,))
input_data=np.random.random((56,224,512))*4095
tmp=ChooseFrameSpatial()
output=tmp.run(0,0,timpestamp,input_data)
print(f"输入数据维度:{input_data.shape}\n输出数据维度:{output.shape}")

View File

@ -0,0 +1,21 @@
'''
@File : max_min.py
@Time : 2024/08/12 14:02:59
@Author : Zhanpeng Yang
@Version : 0.0.1
@Contact : zhpyang@outlook.com
@Desc    : Standardize/scale the input features
'''
import numpy as np
class FeatureScaling:
def __init__(self):
self.description=f"{str(self.__class__).split('.')[-2]}"
self.flag_get_global_info=False
def run( self,feature):
return feature

View File

@ -0,0 +1,23 @@
'''
@File : max_min.py
@Time : 2024/08/12 14:02:59
@Author : Zhanpeng Yang
@Version : 0.0.1
@Contact : zhpyang@outlook.com
@Desc    : Standardize/scale the input features
'''
import numpy as np
class FeatureScaling:
def __init__(self):
self.description=f"{str(self.__class__).split('.')[-2]}"
self.flag_get_global_info=False
def run( self,feature):
feature=(feature-feature.min())/(feature.max()-feature.min())
return feature

View File

@ -0,0 +1,72 @@
'''
@File : max_min.py
@Time : 2024/08/12 14:03:38
@Author : Zhanpeng Yang
@Version : 0.0.1
@Contact : zhpyang@outlook.com
@Desc    : Standardize/scale the input features
'''
import numpy as np
from pathlib import Path
class FeatureScaling:
def __init__(self):
self.description=f"{str(self.__class__).split('.')[-2]}"
self.flag_get_global_info=True
self._min=None
self._max=None
self._mean=None
self._var=None
self._num=0
self.file_path = Path(__file__).absolute()
def load_state_dict(self,state):
self._min=state["_min"]
self._max=state["_max"]
self._mean=state["_mean"]
self._var=state["_var"]
self._num=state["_num"]
def state_dict(self):
return {"_min":self._min,"_max":self._max,"_mean":self._mean,"_var":self._var,"_num":self._num}
def get_global_info(self,feature):
feature=feature.astype(np.float32)
if self._max is None:
self._min=np.zeros(feature.shape,dtype=np.float32)+999999999
self._max=np.zeros(feature.shape,dtype=np.float32)
self._mean=np.zeros(feature.shape,dtype=np.float32)
self._var=np.zeros(feature.shape,dtype=np.float32)
self._num=0
self.feature_shape=feature.shape
self._min=np.minimum(self._min,feature)
self._max=np.maximum(self._max,feature)
if self._num>0:
self._var=((self._num-1)/(self._num))*self._var+(1/(self._num+1))*(feature-self._mean)*(feature-self._mean)
# var must be updated before mean
self._mean=(self._num/(self._num+1))*self._mean+(1/(self._num+1))*feature
self._num =self._num+1
# print("num:",self._num,"_mean:",self._mean,"_var:",self._var,"_min:",self._min,"_max:",self._max)
def run(self,feature):
# for i in range(label.shape[0]):
# label[i]=(label[i]-self._min[i])/(self._max[i]-self._min[i])
feature=(feature-self._mean)/np.sqrt(self._var)
return feature
# def reverse(self,feature):
# # for i in range(labels.shape[1]):
# # labels[:,i]=labels[:,i]*(self._max[i]-self._min[i])+self._min[i]
# labels=labels*np.sqrt(self._var)+self._mean
# return labels

View File

@ -0,0 +1,202 @@
import os
import shutil
from pathlib import Path
import pymysql.cursors
import numpy as np
import gzip
import msgpack
# oldRawDatasetDir=
def getLatestData(dataNum,deviceId='DOJHBG',measureType="TSC"):
connection = pymysql.connect(host='localhost',
user='root',
password='Ghs@2211',
database='SEMS',
cursorclass=pymysql.cursors.DictCursor)
with connection:
with connection.cursor() as cursor:
# define the SQL query
sql = """
SELECT
f.furnaceNumber,
f.measureStartDatetime,
f.measureEndDatetime,
f.Temperature,
f.C,
f.P,
f.S,
f.Mn,
f.Ni,
f.Mo,
f.Cr,
ot.spectralData,
ot.spectralDim,
ot.spatialDim
FROM
furnace f
JOIN
online_test ot ON f.furnaceId = ot.furnaceId
WHERE
f.deviceId = %s AND
f.measureType = %s
ORDER BY
f.furnaceID DESC
LIMIT %s;
"""
values=(deviceId,measureType,dataNum)
cursor.execute(sql,values)
result = cursor.fetchall()
return result
def getMetadataFromMySQL(deviceId='DOJHBG',measureType="TSC"):
connection = pymysql.connect(host='localhost',
user='root',
password='Ghs@2211',
database='SEMS',
cursorclass=pymysql.cursors.DictCursor)
dataNum=0
with connection:
with connection.cursor() as cursor:
# define the SQL query
sql = """
SELECT
COUNT(*)
FROM
furnace as f
WHERE
f.deviceId = %s AND
f.measureType = %s
"""
values=(deviceId,measureType)
cursor.execute(sql,values)
result = cursor.fetchall()
dataNum=result[0]["COUNT(*)"]
sql = """
SELECT furnaceNumber
FROM furnace as f
WHERE
f.deviceId = %s AND
f.measureType = %s
ORDER BY furnaceId DESC
LIMIT 1;
"""
values=(deviceId,measureType)
cursor.execute(sql,values)
result = cursor.fetchall()
lastesFurnaceNumber=result[0]["furnaceNumber"]
return dataNum,lastesFurnaceNumber
def saveDataset(caches, datasetDir,datasetType="train"):
os.makedirs(datasetDir/datasetType)
for cache in caches:
print(type(cache))
if isinstance(cache,str):
oldRawDatasetDir=Path("/data/SEMS-model-training/old_rawdata/dataset")
shutil.copy(oldRawDatasetDir/cache, datasetDir/datasetType)
if isinstance(cache,dict):
print(cache.keys())
spectralData=msgpack.unpackb(gzip.decompress(cache["spectralData"]))
# spectralData=msgpack.unpack(gzip.decompress(spectralDatawithTime["buffer"]))
spectralDataNumpy=np.frombuffer(spectralData["buffer"],dtype=np.uint16).reshape(int(len(spectralData["buffer"])/cache["spectralDim"]/cache["spatialDim"]/2),cache["spectralDim"],cache["spatialDim"])
np.savez(datasetDir/datasetType/f"{cache['furnaceNumber']}.npz",furnaceNumber=cache["furnaceNumber"],measureStartDatetime=cache["measureStartDatetime"],measureEndDatetime=cache["measureEndDatetime"],timestamps=spectralData["timestamps"],rawSpectralData=spectralDataNumpy,rawLabels=np.array([cache["Temperature"],cache["C"],cache["P"],cache["S"],cache["Mn"],cache["Ni"],cache["Mo"],cache["Cr"],]),labelNames=["Temperature","C","P","S","Mn","Ni","Mo","Cr"])
# break
def generateRawDataset(datasetDir="/data/SEMS-model-training/dataset/raw_dataset",datasetNum=1000,validationRate=0.2,testRate=0.1):
datasetDir=Path(datasetDir)
oldRawDatasetDir=Path("/data/SEMS-model-training/old_rawdata/dataset")
oldRawDatasetFileNames=os.listdir(oldRawDatasetDir)
oldRawDatasetFileNames.sort()
oldRawDatasetNum=len(oldRawDatasetFileNames)
# latestFurnaceID=oldRawDatasetFileNames[-1]
DBdataNum,lastesFurnaceNumber=getMetadataFromMySQL(deviceId='DOJHBG',measureType="TSC")
if os.path.exists(datasetDir):
# if a dataset already exists, check whether it is up to date
if os.path.exists(datasetDir/"validation_set"):
# compare the latest furnace number in the database with the latest one in the existing dataset
valNames=os.listdir(datasetDir/"validation_set")
valNames.sort()
latestDatasetFurnaceID=valNames[-1]
if lastesFurnaceNumber==latestDatasetFurnaceID:
# if the dataset is already up to date, return without doing anything
return None
else:
shutil.rmtree(datasetDir)
else:
shutil.rmtree(datasetDir)
os.makedirs(datasetDir)
chooseDBdataNum=0
chooseOLDRawDataNum=0
if (DBdataNum+oldRawDatasetNum)>datasetNum:
if DBdataNum>=datasetNum:
chooseDBdataNum=datasetNum
chooseOLDRawDataNum=0
else:
chooseDBdataNum=DBdataNum
chooseOLDRawDataNum=datasetNum-DBdataNum
else:
chooseDBdataNum=DBdataNum
chooseOLDRawDataNum=oldRawDatasetNum
print(f"旧数据数量:{chooseOLDRawDataNum}, 新数据数量{chooseDBdataNum}")
if chooseDBdataNum>0:
#
DBDataset=getLatestData(chooseDBdataNum,deviceId='DOJHBG',measureType="TSC")
else:
DBDataset=[]
rawDatasetCache=oldRawDatasetFileNames[-chooseOLDRawDataNum:]+DBDataset
print(rawDatasetCache[-1].keys())
datasetNum=len(rawDatasetCache)
valNum=int(datasetNum*validationRate)
testNum=int(datasetNum*testRate)
trainNum=datasetNum-valNum-testNum
saveDataset(rawDatasetCache[:trainNum], datasetDir,datasetType="training")
saveDataset(rawDatasetCache[-testNum-valNum:-testNum], datasetDir,datasetType="validation")
saveDataset(rawDatasetCache[-testNum:], datasetDir,datasetType="test")
if __name__ =="__main__":
tmp=generateRawDataset()
# getMetadataFromMySQL()
# getLatestData(dataNum=1)
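For reference, a sketch (mirroring saveDataset above, illustrative only) of decoding one record returned by getLatestData back into a (frames, spectralDim, spatialDim) array:

import gzip
import msgpack
import numpy as np

row = getLatestData(dataNum=1)[0]                                # newest furnace record
packed = msgpack.unpackb(gzip.decompress(row["spectralData"]))   # dict with "buffer" and "timestamps"
frames = len(packed["buffer"]) // (row["spectralDim"] * row["spatialDim"] * 2)  # 2 bytes per uint16
spectra = np.frombuffer(packed["buffer"], dtype=np.uint16).reshape(
    frames, row["spectralDim"], row["spatialDim"])
timestamps = packed["timestamps"]                                # per-frame timestamps in ms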

View File

@ -33,6 +33,7 @@ class LabelScaling:
self._min[i]=label[i]
def run(self,label):
for i in range(label.shape[0]):
label[i]=(label[i]-self._min[i])/(self._max[i]-self._min[i])
return label
@ -42,4 +43,3 @@ class LabelScaling:
return labels

View File

@ -0,0 +1,75 @@
'''
@File : max_min.py
@Time : 2024/08/12 14:03:38
@Author : Zhanpeng Yang
@Version : 0.0.1
@Contact : zhpyang@outlook.com
@Desc    : Standardize/scale the output labels
'''
import numpy as np
from pathlib import Path
class LabelScaling:
def __init__(self):
self.description=f"{str(self.__class__).split('.')[-2]}"
self.flag_get_global_info=True
self._min=None
self._max=None
self._mean=None
self._var=None
self._num=0
self.file_path = Path(__file__).absolute()
def load_state_dict(self,state):
self._min=state["_min"]
self._max=state["_max"]
self._mean=state["_mean"]
self._var=state["_var"]
self._num=state["_num"]
def state_dict(self):
return {"_min":self._min,"_max":self._max,"_mean":self._mean,"_var":self._var,"_num":self._num}
def get_global_info(self,label):
label=label.astype(np.float32)
label_dim=label.shape[0]
if self._max is None:
self._min=np.zeros((label_dim),dtype=np.float32)+999999999
self._max=np.zeros((label_dim),dtype=np.float32)
self._mean=np.zeros((label_dim),dtype=np.float32)
self._var=np.zeros((label_dim),dtype=np.float32)
self._num=0
self._min=np.minimum(self._min,label)
self._max=np.maximum(self._max,label)
if self._num>0:
self._var=((self._num-1)/(self._num))*self._var+(1/(self._num+1))*(label-self._mean)*(label-self._mean)
# var must be updated before mean
self._mean=(self._num/(self._num+1))*self._mean+(1/(self._num+1))*label
self._num =self._num+1
# print("num:",self._num,"_mean:",self._mean,"_var:",self._var,"_min:",self._min,"_max:",self._max)
# print("_var:",self._var)
def run(self,label):
# for i in range(label.shape[0]):
# label[i]=(label[i]-self._min[i])/(self._max[i]-self._min[i])
label=(label-self._mean)/np.sqrt(self._var)
return label
def reverse(self,labels):
# for i in range(labels.shape[1]):
# labels[:,i]=labels[:,i]*(self._max[i]-self._min[i])+self._min[i]
labels=labels*np.sqrt(self._var)+self._mean
return labels
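Usage sketch (illustrative; training_labels is a hypothetical (N, 8) array of raw label vectors): after get_global_info has seen the training labels, run() standardizes a label vector and reverse() restores the original units:

import numpy as np

ls = LabelScaling()
for y in training_labels:              # hypothetical (N, 8) array of raw labels
    ls.get_global_info(y)

y_scaled = ls.run(training_labels[0].astype(np.float32))   # roughly zero-mean, unit-variance
y_restored = ls.reverse(y_scaled)                          # ~= training_labels[0], up to float error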

View File

@ -14,6 +14,8 @@ import numpy as np
import datetime
import pickle
import h5py
from pathlib import Path
from tqdm import tqdm
class DataPreProcess:
"""
@ -23,274 +25,85 @@ class DataPreProcess:
3. Scale the features and labels
"""
def __init__(self, raw_spectral_data_dir, raw_labels_dir,labels_name,dataset_dir,choose_frame_spatial, features_scaling,labels_scaling,train_ratio=0.8,validate_ratio=0.1) -> None:
def __init__(self, dataset_dir,choose_frame_spatial, features_scaling,labels_scaling) -> None:
"""初始化类,传入所有数据预处理需要的参数
Args:
raw_spectral_data_dir (_type_):原始光谱数据路径
raw_labels_dir (_type_): 原始标签路径
labels_name (_type_): 提取出的标签名
dataset_dir (_type_): 生成的数据集文件夹
choose_frame_spatial (_type_): 类对象进行光谱特征挑选
features_scaling (_type_): 类对象对输入的特征进行放缩使之直接输入神经网络
labels_scaling (_type_): 类对象对输出的的标签进行放缩
train_ratio (float, optional): 训练集比例. Defaults to 0.8.
validate_ratio (float, optional): 验证集比例剩余的即为测试集. Defaults to 0.1.
"""
self.raw_spectral_data_dir=raw_spectral_data_dir
self.raw_labels_dir=raw_labels_dir
self.dataset_dir=dataset_dir
self.labels_name=labels_name
self.dataset_dir=Path(dataset_dir)
self.choose_frame_spatial=choose_frame_spatial
self.features_scaling=features_scaling
self.labels_scaling=labels_scaling
self.train_ratio=train_ratio
self.validate_ratio=validate_ratio
self.labelNames=None
# load the raw labels
self.raw_labels=self._load_raw_labels(raw_labels_dir,labels_name)
# _raw_data_2_dataset(self)
self.run()
# load the raw spectral data cache
self.raw_spectral_data_cache=self._load_raw_spectral_data_cache(raw_spectral_data_dir)
def _raw_data_2_pre_dataset(self,pre_dataset_save_dir,dataset_type="test"):
# load an arbitrary csv file to determine the spectral data dimensions
self.spectral_dim,self.spatial_dim= self._get_spectral_spatial_dim(self.raw_spectral_data_cache)
rawdata_names= os.listdir(self.dataset_dir/"raw_dataset"/dataset_type)
if not os.path.exists(pre_dataset_save_dir/dataset_type):
os.makedirs(pre_dataset_save_dir/dataset_type)
print(f"[预处理]: {dataset_type}数据集选择特征时间与空间点 ")
for name in tqdm(rawdata_names):
# start converting the raw data into the dataset
self._raw_data_2_dataset()
tmp_data=np.load(self.dataset_dir/"raw_dataset"/dataset_type/name,allow_pickle=True)
raw_spectral_data=self.choose_frame_spatial.run(tmp_data["measureStartDatetime"],tmp_data["measureEndDatetime"],tmp_data["timestamps"],tmp_data["rawSpectralData"])
flag_data_augmentation=False
################################
# This block is for data augmentation: raw_spectral_data is 2-D, i.e. several spectral curves, and every curve is paired with the same label.
def _load_raw_labels(self,raw_labels_dir:str,labels_name:list)->pd.DataFrame:
"""读取利用脚本处理后的钢厂给的excel文件所在文件夹会扫描所有文件
并选择指定的label名作为output
并去除值为NaN或0的行
Args:
raw_labels_dir (str): 利用脚本处理后的钢厂给的excel路径
labels_name (list): 指定的作为Label的列
Returns:
pd.DataFrame: 返回所有筛选后的炉次数据
"""
raw_labels=None
for name in os.listdir(raw_labels_dir):
tmp_raw_labels=pd.read_excel(os.path.join(raw_labels_dir,name))
choosed_column=["TSC_start_time","TSC_end_time"]
choosed_column=choosed_column+labels_name
# keep only the columns we want as labels
tmp_raw_labels=tmp_raw_labels.loc[:,choosed_column]
# rows containing NULL
null_rows=tmp_raw_labels.isnull().any(axis=1)
# # rows containing 0
zeros_rows=(tmp_raw_labels==0).any(axis=1) | (tmp_raw_labels=='0').any(axis=1)
# # drop every row that contains any NULL or 0
selected_rows=~(null_rows|zeros_rows)
tmp_raw_labels=tmp_raw_labels[selected_rows]
if raw_labels is None:
raw_labels=tmp_raw_labels
else:
raw_labels=pd.concat([raw_labels,tmp_raw_labels],axis=0)
logging.debug(f"Reading raw label excel file:{name}, which has {tmp_raw_labels.shape[0]} furnaces")
logging.debug(f"Readed raw label excel files, which has {raw_labels.shape[0]} furnaces in total")
return raw_labels
def _load_raw_spectral_data_cache(self,raw_spectral_data_dir:str)->list:
"""生成所有原始光谱数据文件的缓存,包括每个文件记录的开始及结束时间,目的为加快后面读取原始数据的速度
Args:
raw_spectral_data_dir (str): 原始光谱所在路径
Returns:
list: 缓存其中有多少个成员就有多少个原始数据文件每个成员包括格式为datetime的开始及结束时间及文件路径
"""
spectral_file_paths=glob.glob(os.path.join(raw_spectral_data_dir,"*.csv"))
cache_file_paths=glob.glob(os.path.join(raw_spectral_data_dir,"*.pkl"))
update_flag=False
# regenerate the cache if it does not exist or if the number of files it records differs from the number of files in the folder
if len(cache_file_paths)==0:
logging.debug(f"Raw spectral data cache is not existed! Generating")
update_flag=True
elif len(cache_file_paths)==1:
with open(cache_file_paths[0],"rb") as f:
raw_spectral_data_cache=pickle.load(f)
if len(raw_spectral_data_cache) !=len(spectral_file_paths):
logging.debug(f"Raw spectral data cache is out of date! Regenerating, cache file number:{len(raw_spectral_data_cache)}, spectral data file number: {len(spectral_file_paths)}")
update_flag=True
else:
logging.error(f"More the one 'raw_spectral_data_cache.pkl' file is existed in {raw_spectral_data_dir}")
if update_flag:
spectral_file_paths.sort()
raw_spectral_data_cache=[]
for file in spectral_file_paths:
tmp_info={}
tmp_data=np.loadtxt(file, delimiter=",")
start_t=datetime.datetime.fromtimestamp(tmp_data[0,0]/1000)+datetime.timedelta(microseconds=tmp_data[0,0]%1000)
end_t=datetime.datetime.fromtimestamp(tmp_data[-1,0]/1000)+datetime.timedelta(microseconds=tmp_data[-1,0]%1000)
tmp_info["start_t"]=start_t
tmp_info["end_t"]=end_t
tmp_info["file_path"]=file
raw_spectral_data_cache.append(tmp_info)
with open(os.path.join(raw_spectral_data_dir,f"raw_spectral_data_cache.pkl"),"wb") as f:
pickle.dump(raw_spectral_data_cache,f)
return raw_spectral_data_cache
def _get_spectral_spatial_dim(self,raw_spectral_data_cache):
data=np.loadtxt(raw_spectral_data_cache[0]["file_path"], delimiter=",").astype(np.uint64)
if data[0,2]==229376:
spectral_dim =224
spatial_dim=512
if data[0,2]==917504:
spectral_dim =448
spatial_dim=1024
return spectral_dim, spatial_dim
def _read_spectral_data(self,start_time:datetime.datetime,end_time:datetime.datetime)->np.ndarray:
"""获取从start_time到end_time的光谱数据
Args:
start_time (datetime.datetime): 开始时间
end_time (datetime.datetime): 结束时间
Returns:
np.ndarray: 原始光谱数据
"""
def get_spectral_data_per_file(file_path,s_t,e_t):
data=np.loadtxt(file_path, delimiter=",").astype(np.uint64)
if s_t is not None:
tmp_s=datetime.datetime.timestamp(s_t)*1000
tmp_s_index=0
for i in range(data.shape[0]-1):
if data[i,0]<=tmp_s and data[i+1,0]>=tmp_s:
tmp_s_index=i
break
else:
tmp_s_index=0
if e_t is not None:
tmp_e=datetime.datetime.timestamp(e_t)*1000
tmp_e_index=data.shape[0]
for i in range(tmp_s_index,data.shape[0]-1):
if data[i,0]<=tmp_e and data[i+1,0]>=tmp_e:
tmp_e_index=i
break
else:
tmp_e_index=data.shape[0]
with open(file_path[:-3]+"bin", "rb") as f:
f.seek(data[tmp_s_index,1])
d=f.read(np.uint64((tmp_e_index-tmp_s_index)*data[tmp_s_index,2]))
d=np.frombuffer(d, dtype=np.uint16).reshape(tmp_e_index-tmp_s_index,self.spectral_dim,self.spatial_dim)
return data[tmp_s_index:tmp_e_index,0],d
timestamps=None
raw_spectral_data=None
for tmp_info in self.raw_spectral_data_cache:
tmp_data=None
if start_time<tmp_info["start_t"] and end_time>tmp_info["start_t"] and end_time<tmp_info["end_t"]:
# the target interval overlaps the left edge of this file's interval, so take the data from the file start up to end_time
tmp_time_stamp,tmp_data=get_spectral_data_per_file(tmp_info["file_path"] ,None,end_time)
elif start_time<tmp_info["start_t"] and end_time>tmp_info["end_t"]:
# the target interval fully contains this file's interval, so take the data from the file start to the file end
tmp_time_stamp,tmp_data=get_spectral_data_per_file(tmp_info["file_path"] ,None,None)
elif start_time>tmp_info["start_t"] and end_time<tmp_info["end_t"]:
# the target interval is fully contained in this file's interval, so take the data from start_time to end_time
tmp_time_stamp,tmp_data=get_spectral_data_per_file(tmp_info["file_path"] ,start_time,end_time)
elif start_time>tmp_info["start_t"] and start_time<tmp_info["end_t"] and end_time>tmp_info["end_t"]:
# the target interval overlaps the right edge of this file's interval, so take the data from start_time to the file end
tmp_time_stamp,tmp_data=get_spectral_data_per_file(tmp_info["file_path"] ,start_time,None)
if tmp_data is not None:
if raw_spectral_data is None:
timestamps=tmp_time_stamp
raw_spectral_data=tmp_data
else:
timestamps=np.concatenate((timestamps,tmp_time_stamp),axis=0)
raw_spectral_data=np.concatenate((raw_spectral_data,tmp_data),axis=0)
return timestamps,raw_spectral_data
def _raw_data_2_dataset(self):
save_dir=os.path.join(self.dataset_dir,self.choose_frame_spatial.description)
# step 1: select the feature time windows and spatial points, and save them
pre_dataset_save_dir=os.path.join(save_dir,"data")
if not os.path.exists(pre_dataset_save_dir):
os.makedirs(pre_dataset_save_dir)
# if the data path does not exist, regenerate it; if it exists, skip this part.
# Note: if new data arrives, the folders under dataset must be deleted (i.e. the cache cleared) so everything is regenerated
for i in range(self.raw_labels.shape[0]):
start_time,end_time=self.choose_frame_spatial.get_data_interval(self.raw_labels.iloc[i]["TSC_start_time"],self.raw_labels.iloc[i]["TSC_end_time"])
timestamps,raw_spectral_data=self._read_spectral_data(start_time,end_time)
if flag_data_augmentation:
if raw_spectral_data is not None:
# only record if the frame rate of the fetched data is above 2 fps
if raw_spectral_data.shape[0]>2*(end_time-start_time).total_seconds():
logging.debug(f"PreProcess Stage 1: [{i+1}/{self.raw_labels.shape[0]}] with {timestamps.shape[0]} frames")
raw_spectral_data=self.choose_frame_spatial.run(timestamps,raw_spectral_data)
np.savez(os.path.join(pre_dataset_save_dir,f"{timestamps[0]}_{timestamps.shape[0]}.npz",),timestamps=timestamps,raw_spectral_data=raw_spectral_data,raw_labels=self.raw_labels.iloc[i][self.labels_name].to_numpy())
for i in range(raw_spectral_data.shape[0]):
np.savez(pre_dataset_save_dir/dataset_type/f"{name[:-4]}_{i}.npz",furnaceNumber=tmp_data["furnaceNumber"],measureStartDatetime=tmp_data["measureStartDatetime"],measureEndDatetime=tmp_data["measureEndDatetime"],timestamps=tmp_data["timestamps"],rawSpectralData=raw_spectral_data[i,...],rawLabels=tmp_data["rawLabels"],labelNames=tmp_data["labelNames"])
###############################
else:
logging.info(f"Pre Dataset is existed in {pre_dataset_save_dir}")
if raw_spectral_data is not None:
np.savez(pre_dataset_save_dir/dataset_type/name,furnaceNumber=tmp_data["furnaceNumber"],measureStartDatetime=tmp_data["measureStartDatetime"],measureEndDatetime=tmp_data["measureEndDatetime"],timestamps=tmp_data["timestamps"],rawSpectralData=raw_spectral_data,rawLabels=tmp_data["rawLabels"],labelNames=tmp_data["labelNames"])
# step 2: standardize and build the dataset
#####################
self.dataset_save_dir=os.path.join(save_dir,f"{self.features_scaling.description}_{self.labels_scaling.description}")
def _pre_dataset_2_dataset(self,pre_dataset_save_dir,dataset_save_dir,dataset_type="test",savaFlag=True):
if not os.path.exists(self.dataset_save_dir):
os.makedirs(self.dataset_save_dir)
pre_dataset_files=os.listdir(pre_dataset_save_dir)
print(f"[预处理]: {dataset_type}数据集进行数据预处理")
pre_dataset_files=os.listdir(pre_dataset_save_dir/dataset_type)
if self.features_scaling.flag_get_global_info:
if dataset_type=="training" and (self.features_scaling.flag_get_global_info or self.labels_scaling.flag_get_global_info):
for name in pre_dataset_files:
tmp_data=np.load(os.path.join(pre_dataset_save_dir,name),allow_pickle=True)
self.features_scaling.get_global_info(tmp_data["raw_spectral_data"])
tmp_data=np.load(os.path.join(pre_dataset_save_dir,dataset_type,name),allow_pickle=True)
if self.labelNames is None:
self.labelNames=tmp_data["labelNames"]
if self.features_scaling.flag_get_global_info:
self.features_scaling.get_global_info(tmp_data["rawSpectralData"])
if self.labels_scaling.flag_get_global_info:
self.labels_scaling.get_global_info(tmp_data["raw_labels"])
# get the feature/label dimensions
tmp_data=np.load(os.path.join(pre_dataset_save_dir,name),allow_pickle=True)
feature_dim=tmp_data["raw_spectral_data"].shape
label_dim=tmp_data["raw_labels"].shape
self.labels_scaling.get_global_info(tmp_data["rawLabels"])
np.set_printoptions(suppress = True)
# print(f"[预处理]Label全局信息_num",self.labels_scaling._num,"_mean:",self.labels_scaling._mean,"_var:",self.labels_scaling._var,"_min:",self.labels_scaling._min,"_max:",self.labels_scaling._max)
#划分训练集_验证集_测试集
np.random.shuffle(pre_dataset_files)
[train_dateset_files,validate_dateset_files]=np.split(pre_dataset_files,[int(len(pre_dataset_files)*self.train_ratio)])
[validate_dateset_files,test_dateset_files]=np.split(validate_dateset_files,[int(len(pre_dataset_files)*self.validate_ratio)])
if savaFlag:
tmp_data=np.load(pre_dataset_save_dir/dataset_type/pre_dataset_files[0],allow_pickle=True)
feature_dim=tmp_data["rawSpectralData"].shape
label_dim=tmp_data["rawLabels"].shape
# write out the HDF5 files
for dataset_name in ["train","validate","test"]:
if dataset_name=="train":
file_names=train_dateset_files
elif dataset_name=="validate":
file_names=validate_dateset_files
elif dataset_name=="test":
file_names=test_dateset_files
logging.info(f"Generating {dataset_name} dataset with {len(file_names)} samples")
with h5py.File(os.path.join( self.dataset_save_dir,f"{dataset_name}.h5"), 'w') as f:
with h5py.File(os.path.join( dataset_save_dir,f"{dataset_type}.h5"), 'w') as f:
h5_features = f.create_dataset(
'features',
tuple([len(file_names)]+list(feature_dim)),
tuple([len(pre_dataset_files)]+list(feature_dim)),
maxshape=(tuple([None]+list(feature_dim))),
chunks=tuple([1]+list(feature_dim)), # manually set the chunk size to the size of one sample, i.e. tuple([1]+list(feature_dim))
# compression='gzip',
@ -299,52 +112,104 @@ class DataPreProcess:
)
h5_labels = f.create_dataset(
'labels',
tuple([len(file_names)]+list(label_dim)),
tuple([len(pre_dataset_files)]+list(label_dim)),
chunks=tuple([1]+list(label_dim)), # manually set the chunk size to the size of one sample
dtype=np.float32,
)
for i,name in enumerate(file_names):
tmp_data=np.load(os.path.join(pre_dataset_save_dir,name),allow_pickle=True)
for i,name in enumerate(pre_dataset_files):
tmp_data=np.load(os.path.join(pre_dataset_save_dir,dataset_type,name),allow_pickle=True)
feature=self.features_scaling.run(tmp_data["raw_spectral_data"])
label=self.labels_scaling.run(tmp_data["raw_labels"])
feature=self.features_scaling.run(tmp_data["rawSpectralData"])
label=self.labels_scaling.run(tmp_data["rawLabels"])
h5_features[i]=feature.astype(np.float32)
h5_labels[i]=label.astype(np.float32)
else:
pre_dataset_files=os.listdir(pre_dataset_save_dir)
if self.labels_scaling.flag_get_global_info:
for name in pre_dataset_files:
tmp_data=np.load(os.path.join(pre_dataset_save_dir,name),allow_pickle=True)
self.labels_scaling.get_global_info(tmp_data["raw_labels"])
def run(self):
save_dir=self.dataset_dir/self.choose_frame_spatial.description
# step 1: select the feature time windows and spatial points, and save them
pre_dataset_save_dir=save_dir/"pre_dataset"
if not os.path.exists(pre_dataset_save_dir):
os.makedirs(pre_dataset_save_dir)
# if the data path does not exist, regenerate it; if it exists, skip this part.
# Note: if new data arrives, the folders under dataset must be deleted (i.e. the cache cleared) so everything is regenerated
self._raw_data_2_pre_dataset(pre_dataset_save_dir,dataset_type="training")
self._raw_data_2_pre_dataset(pre_dataset_save_dir,dataset_type="validation")
self._raw_data_2_pre_dataset(pre_dataset_save_dir,dataset_type="test")
else:
logging.info(f"Pre Dataset is existed in {pre_dataset_save_dir}")
# step 2: standardize and build the dataset
self.dataset_save_dir=save_dir / f"{self.features_scaling.description}_{self.labels_scaling.description}"
if not os.path.exists(self.dataset_save_dir):
os.makedirs(self.dataset_save_dir)
self._pre_dataset_2_dataset( pre_dataset_save_dir,self.dataset_save_dir,dataset_type="training",savaFlag=True)
self._pre_dataset_2_dataset( pre_dataset_save_dir,self.dataset_save_dir,dataset_type="validation",savaFlag=True)
self._pre_dataset_2_dataset( pre_dataset_save_dir,self.dataset_save_dir,dataset_type="test",savaFlag=True)
else:
self._pre_dataset_2_dataset( pre_dataset_save_dir,self.dataset_save_dir,dataset_type="training",savaFlag=False)
self._pre_dataset_2_dataset( pre_dataset_save_dir,self.dataset_save_dir,dataset_type="validation",savaFlag=False)
self._pre_dataset_2_dataset( pre_dataset_save_dir,self.dataset_save_dir,dataset_type="test",savaFlag=False)
logging.info(f"Standardized Dataset is existed in {self.dataset_save_dir}")
def get_metric(self,outputs,labels):
# print("outputs Before",outputs)
outputs=self.labels_scaling.reverse(outputs)
# print("outputs After",outputs)
# print("labels Before",labels)
labels=self.labels_scaling.reverse(labels)
# print("labels After",labels)
error=outputs-labels
# print("outputs",outputs)
# print("labels",labels)
# print("errors",error)
if len(error.shape)==1:
hit_rate=np.zeros((1))
else:
hit_rate=np.zeros(error.shape[1])
for i,name in enumerate(self.labels_name):
# error[:,i]=outputs[:,i]-labels[:,i]
#["TSC_T","TSC_C","TSC_C_lab","TSC_P_lab"]
bounds=np.zeros(error.shape)
if name =="TSC_T":
bound=10
elif name=="TSC_C":
bound=0.05
elif name=="TSC_C_lab":
bound=0.05
elif name=="TSC_P_lab":
bound=0.0005
# ["Temperature","C","P","S","Mn","Ni","Mo","Cr"]
hit_rate[i]=((error[:,i]>=-bound) &(error[:,i]<=bound)).sum() /error.shape[0]
return error,hit_rate
for i,name in enumerate(self.labelNames):
if name =="Temperature":
bounds[:,i]=10.0+np.zeros(labels[:,i].shape)
elif name=="C":
bounds[:,i]=0.05+np.zeros(labels[:,i].shape)
elif name=="P":
bounds[:,i]=0.005+np.zeros(labels[:,i].shape)
elif name=="S":
bounds[:,i]=0.01+np.zeros(labels[:,i].shape)
elif name=="Mn":
bounds[:,i]=0.05+np.zeros(labels[:,i].shape)
elif name=="Ni":
bounds[:,i]=0.25*labels[:,i]
elif name=="Mo":
bounds[:,i]=0.25*labels[:,i]
elif name=="Cr":
bounds[:,i]=0.25*labels[:,i]
# if isinstance(bound, float):
# print(f"{name}outputs:{outputs[0,i]}labels:{labels[0,i]}error:{error[0,i]}bound:{bound}")
# else:
# print(f"{name}outputs:{outputs[0,i]}labels:{labels[0,i]}error:{error[0,i]}bound:{bound[0]}")
hit_rate[i]=((error[:,i]>=-bounds[:,i]) &(error[:,i]<=bounds[:,i])).sum() /error.shape[0]
return outputs, labels,error,hit_rate,bounds
@ -355,19 +220,13 @@ if __name__=="__main__":
logging.basicConfig(level = logging.DEBUG)
raw_data_dir="/code/admin/20240806-NanEr-5-8-data/rawdata"
labels_path="/code/admin/20240806-NanEr-5-8-data/labels/NanEr"
dataset_dir="/code/admin/20240806-NanEr-5-8-data/dataset"
labels_name=["TSC_T","TSC_C","TSC_C_lab","TSC_P_lab"]
datasetDir="/data/SEMS-model-training/dataset"
from choose_frame_spatial.mean import ChooseFrameSpatial
from features_scaling.max_min import FeatureScaling
from labels_scaling.max_min import LabelScaling
choose_frame_spatial=ChooseFrameSpatial(interval=[-30,30])
from choose_frame_spatial.mean_CARS100_3 import ChooseFrameSpatial
from features_scaling.standardization import FeatureScaling
from labels_scaling.standardization import LabelScaling
choose_frame_spatial=ChooseFrameSpatial()
features_scaling=FeatureScaling()
labels_scaling=LabelScaling()
data_pre_process=DataPreProcess(raw_data_dir,labels_path,labels_name,dataset_dir,choose_frame_spatial,features_scaling,labels_scaling)
data_pre_process=DataPreProcess(datasetDir,choose_frame_spatial,features_scaling,labels_scaling)
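A small worked sketch (made-up numbers) of the hit-rate rule used by get_metric above: a prediction counts as a hit when its error falls inside the per-label tolerance band (e.g. ±10 for Temperature, ±0.05 for C; Ni/Mo/Cr use ±25 % of the true value):

import numpy as np

labels  = np.array([[1600.0, 0.40], [1550.0, 0.30]])   # true [Temperature, C]
outputs = np.array([[1608.0, 0.47], [1545.0, 0.28]])   # de-scaled predictions
bounds  = np.array([[10.0, 0.05], [10.0, 0.05]])       # tolerance per element

error = outputs - labels
hit_rate = ((error >= -bounds) & (error <= bounds)).sum(axis=0) / error.shape[0]
print(hit_rate)   # [1.  0.5]: both temperature predictions hit, only one carbon prediction does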

View File

@ -4,10 +4,10 @@ import os
import time
def load_dataset(dataset_dir):
train_dataset=SpectralDataset(os.path.join(dataset_dir,"train.h5"))
val_dataset=SpectralDataset(os.path.join(dataset_dir,"validate.h5"))
return train_dataset, val_dataset
def load_dataset(dataset_dir,dataset_type="training"):
dataset=SpectralDataset(os.path.join(dataset_dir,f"{dataset_type}.h5"))
# val_dataset=SpectralDataset(os.path.join(dataset_dir,"validation.h5"))
return dataset
class SpectralDataset(torch.utils.data.Dataset):
def __init__(self, hdf5_path):
@ -17,8 +17,8 @@ class SpectralDataset(torch.utils.data.Dataset):
# rdcc_w0=0, # cache eviction policy: values near 0 preferentially evict least-recently-used chunks; values near 1 preferentially evict chunks that have been fully read or written
# rdcc_nslots=1e8, # number of hash slots used to locate cached chunks; recommended to be 10x the number of cached chunks, 100x for best performance. Default is 521
)
self.features=self.hdf5_f["features"]
self.labels=self.hdf5_f["labels"]
self.features=self.hdf5_f["features"][...]
self.labels=self.hdf5_f["labels"][...]
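# Editor's note: indexing with [...] eagerly loads the full HDF5 datasets into memory
# as numpy arrays, trading RAM for faster __getitem__ calls (the previous version kept
# the h5py dataset handles and read lazily per item).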
def __len__(self):
return self.features.shape[0]
@ -26,10 +26,15 @@ class SpectralDataset(torch.utils.data.Dataset):
#构造Input
feature=self.features[idx]
label=self.labels[idx]
return feature, label
@ -37,7 +42,7 @@ class SpectralDataset(torch.utils.data.Dataset):
if __name__ =="__main__":
training_data=SpectralDataset("/home/admin/20240806-NanEr-5-8-data/dataset/mean_-30_30/max_min_max_min/train.h5")
training_data=SpectralDataset("/data/SEMS-model-training/dataset/ChooseFrameSpatial'>_-30_30/max_min_max_min/training.h5")
print("数据集大小为:",len(training_data))
print("输入Feature维度为", training_data[0][0].shape, "输出Label维度为",training_data[0][1].shape)

126
src/data/test.ipynb Normal file
View File

@ -0,0 +1,126 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[预处理]: training数据集进行数据预处理\n",
"[预处理]Label全局信息_num 549 _mean: [1591.5325 0.39140984 0.04271309 0.02187012 0.14065096\n",
" 0.02857196 0.00202732 0.10250819] _var: [786.9018 0.01670736 0.00040787 0.0000213 0.00179902\n",
" 0.00326453 0.00004455 0.00067495] _min: [1480. 0.079 0.0005 0.0003 0.0005 0.001 0.001\n",
" 0.001 ] _max: [1666. 0.829 0.0957 0.044 0.7264 0.823 0.152\n",
" 0.522 ]\n",
"[预处理]: validation数据集进行数据预处理\n",
"[预处理]Label全局信息_num 549 _mean: [1591.5325 0.39140984 0.04271309 0.02187012 0.14065096\n",
" 0.02857196 0.00202732 0.10250819] _var: [786.9018 0.01670736 0.00040787 0.0000213 0.00179902\n",
" 0.00326453 0.00004455 0.00067495] _min: [1480. 0.079 0.0005 0.0003 0.0005 0.001 0.001\n",
" 0.001 ] _max: [1666. 0.829 0.0957 0.044 0.7264 0.823 0.152\n",
" 0.522 ]\n",
"[预处理]: test数据集进行数据预处理\n",
"[预处理]Label全局信息_num 549 _mean: [1591.5325 0.39140984 0.04271309 0.02187012 0.14065096\n",
" 0.02857196 0.00202732 0.10250819] _var: [786.9018 0.01670736 0.00040787 0.0000213 0.00179902\n",
" 0.00326453 0.00004455 0.00067495] _min: [1480. 0.079 0.0005 0.0003 0.0005 0.001 0.001\n",
" 0.001 ] _max: [1666. 0.829 0.0957 0.044 0.7264 0.823 0.152\n",
" 0.522 ]\n"
]
}
],
"source": [
"from pre_process import DataPreProcess\n",
"\n",
"\n",
"datasetDir=\"/data/SEMS-model-training/dataset\"\n",
"\n",
"\n",
"from choose_frame_spatial.DBSCAN import ChooseFrameSpatial\n",
"from features_scaling.standardization import FeatureScaling\n",
"from labels_scaling.standardization import LabelScaling\n",
"choose_frame_spatial=ChooseFrameSpatial()\n",
"features_scaling=FeatureScaling()\n",
"labels_scaling=LabelScaling()\n",
"data_pre_process=DataPreProcess(datasetDir,choose_frame_spatial,features_scaling,labels_scaling)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([1591.5325 , 0.39140984, 0.04271309, 0.02187012,\n",
" 0.14065096, 0.02857196, 0.00202732, 0.10250819],\n",
" dtype=float32)"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"labels_scaling._mean"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Before: [1612.5325, 0.450984, 0.067271309, 0.05687012, 0.2365096, 0.06357196, 0.0034502732, 0.450250819]\n",
"Scaled: [0.7486169269676385, 0.46089706984319634, 1.2160017876067477, 7.584260858761072, 2.26002739629723, 0.612572700572906, 0.21319305953525988, 13.385111606844243]\n",
"Reversed\t: [1612.5325, 0.450984, 0.067271309, 0.05687012, 0.2365096, 0.06357196, 0.0034502732, 0.450250819]\n",
"Before \t: [1612.5325, 0.450984, 0.067271309, 0.05687012, 0.2365096, 0.06357196, 0.0034502732, 0.450250819]\n"
]
}
],
"source": [
"labels= [1612.5325 , 0.450984, 0.067271309 , 0.05687012 , 0.2365096, 0.06357196 , 0.0034502732 , 0.450250819]\n",
"print(f\"Before: {labels}\")\n",
"tmp=labels_scaling.run(labels)\n",
"print(f\"Scaled: {list(tmp)}\")\n",
"labels_r=labels_scaling.reverse(tmp)\n",
"print(f\"Reversed\\t: {list(labels_r)}\")\n",
"print(f\"Before \\t: {labels}\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "dl",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@ -5,6 +5,10 @@ import ray,os
import ray.tune
from functools import partial
import shutil
import subprocess
@ -24,6 +28,16 @@ def main(hydra_cfg : omegaconf.DictConfig) -> None:
hydra_output_dir=hydra_output_dir['runtime']['output_dir']
command = f"""
sudo kill -9 $(sudo lsof -t -i :6006);
tensorboard --logdir {os.path.join(hydra_output_dir,"ray-tune")} --host 0.0.0.0 --port 6006
"""
# create the child process
tensorboard_subprocess = subprocess.Popen(command, shell=True, executable='/bin/bash',start_new_session=True)
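# Editor's note: the shell snippet above assumes passwordless sudo and that port 6006
# is only ever occupied by a stale TensorBoard instance; any other process listening
# on that port would be killed as well.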
#fetch the current hyperparameter configuration
tune_cfg={}
for key in hydra_cfg.ray_tune.config.keys():
@ -44,10 +58,24 @@ def main(hydra_cfg : omegaconf.DictConfig) -> None:
)
#save the best model
best_trial = result.get_best_trial("val_loss", "min", "last")
print(f"Best trial config: {best_trial.config}")
print(f"Best trial final validation loss: {best_trial.last_result['val_loss']}")
print(f"Best trial checkpoint path: {best_trial.checkpoint.path}")
shutil.copytree(best_trial.checkpoint.path, os.path.join(hydra_output_dir,"best-model"))
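# Editor's note: with scope "last", get_best_trial ranks trials by the val_loss of their
# final reported result; the checkpoint directory of that trial is then copied next to
# the Hydra run output as "best-model".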
# tensorboard_subprocess.terminate()
# tensorboard_subprocess.wait()
# print("TensorBoard terminated")

160
src/model/DRSN-CW.py Normal file
View File

@ -0,0 +1,160 @@
import torch
import torch.nn as nn
class BasicBlock(nn.Module):
expansion = 1
def __init__(self, in_channels, out_channels, stride=1):
super().__init__()
self.shrinkage = Shrinkage(out_channels, gap_size=(1))
# residual function
self.residual_function = nn.Sequential(
nn.Conv1d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False),
nn.BatchNorm1d(out_channels),
nn.ReLU(inplace=True),
nn.Conv1d(out_channels, out_channels * BasicBlock.expansion, kernel_size=3, padding=1, bias=False),
nn.BatchNorm1d(out_channels * BasicBlock.expansion),
self.shrinkage
)
# shortcut
self.shortcut = nn.Sequential()
# when the shortcut output dimension does not match the residual function's,
# use a 1x1 convolution to match the dimension
if stride != 1 or in_channels != BasicBlock.expansion * out_channels:
self.shortcut = nn.Sequential(
nn.Conv1d(in_channels, out_channels * BasicBlock.expansion, kernel_size=1, stride=stride, bias=False),
nn.BatchNorm1d(out_channels * BasicBlock.expansion)
)
def forward(self, x):
return nn.ReLU(inplace=True)(self.residual_function(x) + self.shortcut(x))
# a = self.residual_function(x),
# b = self.shortcut(x),
# c = a+b
# return c
class Shrinkage(nn.Module):
def __init__(self, channel, gap_size):
super(Shrinkage, self).__init__()
self.gap = nn.AdaptiveAvgPool1d(gap_size)
self.fc = nn.Sequential(
nn.Linear(channel, channel),
nn.ReLU(inplace=True),
nn.Linear(channel, channel),
nn.Sigmoid(),
)
def forward(self, x):
x_raw = x
x = torch.abs(x)
x_abs = x
x = self.gap(x)
x = torch.flatten(x, 1)
# average = torch.mean(x, dim=1, keepdim=True) #CS
average = x #CW
x = self.fc(x)
x = torch.mul(average, x)
x = x.unsqueeze(2)
# soft thresholding
sub = x_abs - x
zeros = sub - sub
n_sub = torch.max(sub, zeros)
x = torch.mul(torch.sign(x_raw), n_sub)
return x
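# Editor's note (reading of the block above): this is the channel-wise soft thresholding
# of DRSN-CW. A per-channel threshold tau_c = GAP(|x|)_c * sigmoid(fc(GAP(|x|)))_c is
# learned from the absolute activations, and the output is
#   y = sign(x) * max(|x| - tau, 0)
# which shrinks small-magnitude (presumably noise-dominated) responses to zero.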
class SpectralModel(nn.Module):
def __init__(self, block=BasicBlock, num_block=[3, 4, 6, 3], num_classes=8):
super().__init__()
self.in_channels = 64
self.conv1 = nn.Sequential(
nn.Conv1d(1, 64, kernel_size=3, padding=1, bias=False),
nn.BatchNorm1d(64),
nn.ReLU(inplace=True))
# we use a different input size than the original paper,
# so conv2_x's stride is 1
self.conv2_x = self._make_layer(block, 64, num_block[0], 1)
self.conv3_x = self._make_layer(block, 128, num_block[1], 2)
self.conv4_x = self._make_layer(block, 256, num_block[2], 2)
self.conv5_x = self._make_layer(block, 512, num_block[3], 2)
self.avg_pool = nn.AdaptiveAvgPool1d((1))
# self.fc = nn.Linear(512 * block.expansion, 1024)
# self.fc = nn.Linear(1024 , 512)
# self.fc = nn.Linear(512, 512)
self.fc = nn.Sequential(
nn.BatchNorm1d(512),
nn.Linear(512 * block.expansion, 1024),
nn.ReLU(inplace=True),nn.BatchNorm1d(1024),
nn.Linear(1024 , 512),
nn.ReLU(inplace=True),nn.BatchNorm1d(512),
nn.Linear(512, 128),
nn.ReLU(inplace=True),nn.BatchNorm1d(128),
nn.Linear(128, 8),)
def _make_layer(self, block, out_channels, num_blocks, stride):
"""make rsnet layers(by layer i didnt mean this 'layer' was the
same as a neuron netowork layer, ex. conv layer), one layer may
contain more than one residual shrinkage block
Args:
block: block type, basic block or bottle neck block
out_channels: output depth channel number of this layer
num_blocks: how many blocks per layer
stride: the stride of the first block of this layer
Return:
return a rsnet layer
"""
# we have num_blocks blocks per layer; the stride of the first block
# could be 1 or 2, while the other blocks always use stride 1
strides = [stride] + [1] * (num_blocks - 1)
layers = []
for stride in strides:
layers.append(block(self.in_channels, out_channels, stride))
self.in_channels = out_channels * block.expansion
return nn.Sequential(*layers)
def forward(self, x):
x=torch.unsqueeze(x,1)
output = self.conv1(x)
output = self.conv2_x(output)
output = self.conv3_x(output)
output = self.conv4_x(output)
output = self.conv5_x(output)
output = self.avg_pool(output)
output = output.view(output.size(0), -1)
output = self.fc(output)
return output
# def rsnet18():
# """ return a RSNet 18 object
# """
# return RSNet(BasicBlock, [2, 2, 2, 2])
# def rsnet34():
# """ return a RSNet 34 object
# """
# return RSNet(BasicBlock, [3, 4, 6, 3])
if __name__=='__main__':
import torchinfo
model = SpectralModel()
model.eval()
# print(model)
input = torch.randn(64,224)
y = model(input)
print(y.size())
torchinfo.summary(model,input_size=(64,224))
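# Editor's note: with the default num_classes=8 the sanity check above should print
# torch.Size([64, 8]) for the (64, 224) random input.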

View File

@ -3,9 +3,9 @@ import torch.nn.functional as F
import torchinfo
import torch
class CNN_FCNN(nn.Module):
class SpectralModel(nn.Module):
def __init__(self,):
super(CNN_FCNN, self).__init__()
super(SpectralModel, self).__init__()
self.Conv2d_1=nn.Conv2d(in_channels=1, out_channels=512, kernel_size=3,stride=1)
self.Conv2d_2=nn.Conv2d(in_channels=512, out_channels=1024, kernel_size=3,stride=1)
@ -24,7 +24,7 @@ class CNN_FCNN(nn.Module):
self.fc_2=nn.Linear(in_features=2048,out_features=1024)
self.fc_3=nn.Linear(in_features=1024,out_features=512)
self.fc_4=nn.Linear(in_features=512,out_features=256)
self.fc_5=nn.Linear(in_features=256,out_features=4)
self.fc_5=nn.Linear(in_features=256,out_features=8)
# self.reshape_3=nn.Unflatten(dim=0,unflattened_size=(batch_size,601))
@ -71,6 +71,6 @@ class CNN_FCNN(nn.Module):
return x
if __name__=="__main__":
model=CNN_FCNN()
model=SpectralModel()
torchinfo.summary(model,input_size=(64, 48, 224, 10))
torchinfo.summary(model,input_size=(64, 20, 224, 10))

View File

@ -0,0 +1,78 @@
import torch.nn as nn
import torch.nn.functional as F
import torchinfo
import torch
class SpectralModel(nn.Module):
def __init__(self,):
super(SpectralModel, self).__init__()
self.Conv2d_1=nn.Conv2d(in_channels=1, out_channels=4, kernel_size=3,stride=1)
self.Conv2d_2=nn.Conv2d(in_channels=4, out_channels=8, kernel_size=3,stride=1)
# self.MaxPool2d_1=nn.MaxPool2d(kernel_size=2, stride=2)
self.Conv2d_3=nn.Conv2d(in_channels=8, out_channels=16, kernel_size=3,stride=1)
self.Conv2d_4=nn.Conv2d(in_channels=16, out_channels=8, kernel_size=3,stride=1)
self.Conv2d_5=nn.Conv2d(in_channels=8, out_channels=4, kernel_size=2,stride=1)
self.flatten=nn.Flatten(start_dim=1, end_dim=3)
self.fc_1=nn.Linear(in_features=860,out_features=512) #2048->1024
self.fc_2=nn.Linear(in_features=512,out_features=256)
self.fc_3=nn.Linear(in_features=256,out_features=128)
self.fc_4=nn.Linear(in_features=128,out_features=64)
self.fc_5=nn.Linear(in_features=64,out_features=8)
# self.reshape_3=nn.Unflatten(dim=0,unflattened_size=(batch_size,601))
def forward(self, x):
# input_shape=x.shape
x=torch.mean(x,dim=1,keepdim=True)
# x=torch.unsqueeze(x, 1)
x=F.tanh(self.Conv2d_1(x))
x=F.tanh(self.Conv2d_2(x))
# x=self.MaxPool2d_1(x)
x=F.tanh(self.Conv2d_3(x))
x=F.tanh(self.Conv2d_4(x))
x=F.tanh(self.Conv2d_5(x))
x=self.flatten(x)
# print(x.shape)  # debug print used to determine the flattened size, left disabled
x=F.tanh(self.fc_1(x))
x=F.tanh(self.fc_2(x))
x=F.tanh(self.fc_3(x))
x=F.tanh(self.fc_4(x))
x=F.sigmoid(self.fc_5(x))
#
# x=nn.Unflatten(dim=0,unflattened_size=(int(input_shape[0]),int(input_shape[1])))(x)
# x=nn.Flatten(start_dim=2, end_dim=4)(x)
# x,_=self.lstm_1(x)
# x=x[:,-1,:]
# x=nn.LeakyReLU()(self.fc_1(x))
# x=nn.LeakyReLU()(self.fc_2(x))
# x=nn.Sigmoid()(self.fc_3(x))
return x
if __name__=="__main__":
model=SpectralModel()
torchinfo.summary(model,input_size=(64, 20, 224, 10))

View File

@ -3,9 +3,9 @@ import torch.nn.functional as F
import torchinfo
import torch
class CNN_LSTM_FCNN(nn.Module):
class SpectralModel(nn.Module):
def __init__(self,):
super(CNN_LSTM_FCNN, self).__init__()
super(SpectralModel, self).__init__()
# self.reshape_1=nn.Flatten(start_dim=0, end_dim=1)
self.Conv2d_1=nn.Conv2d(in_channels=1, out_channels=256, kernel_size=3,stride=1)
@ -27,7 +27,7 @@ class CNN_LSTM_FCNN(nn.Module):
self.fc_1=nn.Linear(in_features=2048,out_features=1024) #2048->1024
self.fc_2=nn.Linear(in_features=1024,out_features=256)
self.fc_3=nn.Linear(in_features=256,out_features=4)
self.fc_3=nn.Linear(in_features=256,out_features=8)
# self.reshape_3=nn.Unflatten(dim=0,unflattened_size=(batch_size,601))
@ -64,6 +64,6 @@ class CNN_LSTM_FCNN(nn.Module):
return x
if __name__=="__main__":
model=CNN_LSTM_FCNN()
model=SpectralModel()
torchinfo.summary(model,input_size=(64, 48, 224, 10))
torchinfo.summary(model,input_size=(64, 20, 224, 10))

View File

@ -0,0 +1,55 @@
import torch.nn as nn
import torch.nn.functional as F
import torchinfo
import torch
class SpectralModel(nn.Module):
def __init__(self,):
super(SpectralModel, self).__init__()
self.norm=nn.BatchNorm1d(224)
self.fc1 = nn.Linear(224, 1024)
self.fc2 = nn.Linear(1024, 2048)
self.fc3 = nn.Linear(2048, 4096)
self.fc4 = nn.Linear(4096, 2048)
self.fc5 = nn.Linear(2048, 1024)
self.fc6 = nn.Linear(1024, 512)
self.fc7 = nn.Linear(512, 128)
self.fc8 = nn.Linear(128, 8)
def forward(self, x):
x=self.norm(x)
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = F.relu(self.fc3(x))
x = F.relu(self.fc4(x))
x = F.relu(self.fc5(x))
x = F.relu(self.fc6(x))
x = F.relu(self.fc7(x))
x = F.relu(self.fc8(x))
x = F.sigmoid(x)
return x
# class SpectralModel(nn.Module):
# def __init__(self,):
# super(SpectralModel, self).__init__()
# self.norm=nn.BatchNorm1d(224)
# self.submodels = nn.ModuleList([SmallFCNNModel() for _ in range(8)])
# def forward(self, x):
# x=self.norm(x)
# res = [submodel(x) for submodel in self.submodels]
# x=torch.cat(res,dim=1)
# return x
if __name__=="__main__":
model=SpectralModel().to("cuda:0")
torchinfo.summary(model,input_size=(64, 224))

View File

@ -0,0 +1,68 @@
import torch.nn as nn
import torch.nn.functional as F
import torchinfo
import torch
class SmallFCNNModel(nn.Module):
def __init__(self,):
super(SmallFCNNModel, self).__init__()
self.fc1 = nn.Linear(224, 50)
self.fc9 = nn.Linear(50, 1)
def forward(self, x):
# x=self.norm(x)
x = F.relu(self.fc1(x))
x = F.sigmoid(self.fc9(x))
return x
class SpectralModel(nn.Module):
def __init__(self,):
super(SpectralModel, self).__init__()
self.norm=nn.BatchNorm1d(224)
# self.fc1 = nn.Linear(100, 50)
# # self.fc2 = nn.Linear(1024, 2048)
# # self.fc3 = nn.Linear(2048, 4096)
# # self.fc4 = nn.Linear(4096, 2048)
# # self.fc5 = nn.Linear(2048, 1024)
# # self.fc6 = nn.Linear(1024, 512)
# # self.fc7 = nn.Linear(512, 256)
# # self.fc8 = nn.Linear(256, 128)
# self.fc9 = nn.Linear(50, 8)
# # self.fc4 = nn.Linear(10240, 4)
self.submodels = nn.ModuleList([SmallFCNNModel() for _ in range(8)])
def forward(self, x):
x=self.norm(x)
res = [submodel(x) for submodel in self.submodels]
# res=[self.submodels[0](x)]
# for i in range(1,len(self.submodels)):
# res.append(self.submodels[i](0*x))
# x = F.relu(self.fc2(x))
# x = F.relu(self.fc3(x))
# x = F.relu(self.fc4(x))
# x = F.relu(self.fc5(x))
# x = F.relu(self.fc6(x))
# x = F.relu(self.fc7(x))
# x = F.relu(self.fc8(x))
# x = F.sigmoid(self.fc9(x))
x=torch.cat(res,dim=1)
return x
if __name__=="__main__":
model=SpectralModel().to("cuda:0")
torchinfo.summary(model,input_size=(64, 224))


View File

@ -22,6 +22,8 @@ import numpy as np
import matplotlib.pyplot as plt
from optimizer.utils import save_inference_files
logger = logging.getLogger("ray")
def trainable(hydra_cfg,tune_cfg):
@ -39,7 +41,7 @@ def trainable(hydra_cfg,tune_cfg):
## 原始数据预处理为数据集 ##
############################
t_1=time.time()
data_preprocess=DataPreProcess( hydra_cfg.raw_spectral_data_dir,hydra_cfg.raw_labels_dir,hydra_cfg.labels_name,hydra_cfg.dataset_dir,hydra.utils.instantiate(tune_cfg["choose_frame_spatial"]), hydra.utils.instantiate(tune_cfg["features_scaling"]),hydra.utils.instantiate(tune_cfg["labels_scaling"]),hydra_cfg.dataset.train_ratio,hydra_cfg.dataset.validate_ratio)
data_preprocess=DataPreProcess( hydra_cfg.dataset_dir,hydra.utils.instantiate(tune_cfg["choose_frame_spatial"]), hydra.utils.instantiate(tune_cfg["features_scaling"]),hydra.utils.instantiate(tune_cfg["labels_scaling"]))
t_2=time.time()
logger.info(f"Preprocessed raw data costs {t_2-t_1}s")
@ -47,12 +49,18 @@ def trainable(hydra_cfg,tune_cfg):
## 加载数据集为dataloader ##
############################
train_dataset,val_dataset=load_dataset(data_preprocess.dataset_save_dir)
train_dataset=load_dataset(data_preprocess.dataset_save_dir,dataset_type="training")
val_dataset=load_dataset(data_preprocess.dataset_save_dir,dataset_type="validation")
trainloader = torch.utils.data.DataLoader(
train_dataset, batch_size=int(tune_cfg["batch_size"]), shuffle=True, num_workers=hydra_cfg.dataset.num_worker
train_dataset, batch_size=int(tune_cfg["batch_size"]), shuffle=True, num_workers=hydra_cfg.dataset.num_worker,drop_last=False
)
valloader = torch.utils.data.DataLoader(
val_dataset, batch_size=int(tune_cfg["batch_size"]), shuffle=True, num_workers=hydra_cfg.dataset.num_worker
val_dataset, batch_size=int(tune_cfg["batch_size"]), shuffle=True, num_workers=hydra_cfg.dataset.num_worker,drop_last=False
)
t_3=time.time()
logger.info(f"Dataloader costs {t_3-t_2}s")
@ -101,7 +109,7 @@ def trainable(hydra_cfg,tune_cfg):
## 生成模型架构图写入tensorboad ##
################################
sample = train_dataset[0:4][0]
print(sample.shape)
# print(sample.shape)
writer.add_graph(model, torch.tensor(sample).to(device))
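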
@ -113,6 +121,7 @@ def trainable(hydra_cfg,tune_cfg):
################################
## 模型训练 ##
################################
logger.info(f"Start epoch {epoch+1}")
train_loss = 0.0
train_steps = 0
@ -121,12 +130,29 @@ def trainable(hydra_cfg,tune_cfg):
train_errors=None
train_outputs=None
train_labels=None
train_bounds=None
model.train()
# t_1=time.time()
# for batch, (features, labels) in enumerate(trainloader):
# features=features.to(device)
# labels=labels.to(device)
# t_2=time.time()
# print(f"DEBUDING cost{t_2-t_1}s")
for batch, (features, labels) in enumerate(trainloader):
t_iteration_start=time.time()
features=features.to(device)
labels=labels.to(device)
# zero the parameter gradients
optimizer.zero_grad()
@ -136,9 +162,13 @@ def trainable(hydra_cfg,tune_cfg):
labels_npy=labels.cpu().detach().numpy()
outputs_npy=outputs.cpu().detach().numpy()
# print("outputs_npy",outputs_npy)
# print("labels_npy",labels_npy)
# 生成自定义的指标
error,hit_rate=data_preprocess.get_metric(outputs_npy, labels_npy)
outputs_npy, labels_npy,error,hit_rate,bounds=data_preprocess.get_metric(outputs_npy, labels_npy)
# print("outputs_npy_after",outputs_npy)
# print("labels_npy_after",labels_npy)
# Backpropagation
loss.backward()
@ -147,16 +177,21 @@ def trainable(hydra_cfg,tune_cfg):
# 记录我们自定义的指标
train_loss += loss.item()
train_steps += 1
if train_steps==1:
train_hit_rate=hit_rate
train_errors=error
train_outputs=outputs_npy
train_labels=labels_npy
train_bounds=bounds
else:
train_hit_rate=(train_steps-1)/train_steps *train_hit_rate+ 1/train_steps*hit_rate
train_errors=np.concatenate((train_errors,error),axis=0)
train_outputs=np.concatenate((train_outputs,outputs_npy),axis=0)
train_labels=np.concatenate((train_labels,labels_npy),axis=0)
train_bounds=np.concatenate((train_bounds,bounds),axis=0)
t_iteration_end=time.time()
print("Training Epoch:[{}/{}],Iteration:{}/{},Loss:{}, Cost {}s".format(epoch+1,hydra_cfg.train.max_epoch,train_steps,len(trainloader),loss.item(),t_iteration_end-t_iteration_start))
@ -168,6 +203,9 @@ def trainable(hydra_cfg,tune_cfg):
val_loss = 0.0
val_steps = 0
val_errors=None
val_outputs=None
val_labels=None
val_hit_rate=None
t_epoch_train_end=time.time()
@ -177,20 +215,30 @@ def trainable(hydra_cfg,tune_cfg):
for batch, (features, labels) in enumerate(valloader):
t_iteration_start=time.time()
with torch.no_grad():
features=features.to(device)
labels=labels.to(device)
outputs = model(features)
loss = criterion(outputs, labels)
error,hit_rate=data_preprocess.get_metric(outputs.cpu().detach().numpy(), labels.cpu().detach().numpy())
labels_npy=labels.cpu().detach().numpy()
outputs_npy=outputs.cpu().detach().numpy()
outputs_npy, labels_npy,error,hit_rate,bounds=data_preprocess.get_metric(outputs_npy, labels_npy)
val_loss += loss.cpu().numpy()
val_steps += 1
if val_steps==1:
val_hit_rate=hit_rate
val_errors=error
val_outputs=outputs_npy
val_labels=labels_npy
else:
val_hit_rate=(val_steps-1)/val_steps *val_hit_rate+ 1/val_steps*hit_rate
val_errors=np.concatenate((val_errors,error),axis=0)
val_outputs=np.concatenate((val_outputs,outputs_npy),axis=0)
val_labels=np.concatenate((val_labels,labels_npy),axis=0)
t_iteration_end=time.time()
print("Validate Iteration:{}/{},Loss:{}, Cost {}s".format(val_steps,len(valloader),loss.item(),t_iteration_end-t_iteration_start))
@ -206,41 +254,157 @@ def trainable(hydra_cfg,tune_cfg):
checkpoint_data = {
"epoch": epoch,
"net_state_dict": model.state_dict(),
"optimizer_state_dict": optimizer.state_dict(),
}
with tempfile.TemporaryDirectory() as checkpoint_dir:
data_path = pathlib.Path(checkpoint_dir) / "data.pkl"
with open(data_path, "wb") as fp:
pickle.dump(checkpoint_data, fp)
checkpoint = ray.train.Checkpoint.from_directory(checkpoint_dir)
reports={"val_loss": val_loss / val_steps,
"train_loss": train_loss,
}
for i,name in enumerate(hydra_cfg.labels_name):
reports[f"train_hit_rate_{name}"]=train_hit_rate[i]
reports[f"val_hit_rate_{name}"]=val_hit_rate[i]
writer.add_histogram(tag=f'train_error_{name}', values=train_errors[:,i], global_step=epoch)
for i,name in enumerate(data_preprocess.labelNames):
# reports[=
writer.add_scalar(f"{name}/training/hit_rate",train_hit_rate[i],global_step=epoch)
writer.add_scalar(f"{name}/validation/hit_rate",val_hit_rate[i],global_step=epoch)
if (epoch!=0 and epoch%hydra_cfg.train.checkpoint_interval==0):
for i,name in enumerate(data_preprocess.labelNames):
# reports[=
writer.add_histogram(tag=f'{name}/training/error_histogram', values=train_errors[:,i], global_step=epoch)
writer.add_histogram(tag=f'{name}/validation/error_histogram', values=val_errors[:,i], global_step=epoch)
fig, ax = plt.subplots()
ax.scatter(np.arange(train_outputs.shape[0]),train_outputs[:,i])
ax.scatter(np.arange(train_outputs.shape[0]),train_labels[:,i])
writer.add_figure(f'train_imgage_{name}', fig , epoch)
ax.scatter(train_labels[:,i],train_outputs[:,i])
ax.plot([train_labels[:,i].min(), train_labels[:,i].max()], [train_labels[:,i].min(), train_labels[:,i].max()], 'r--')  # y = x reference line
ax.plot(train_labels[:,i],train_labels[:,i]-train_bounds[:,i], 'r--')
ax.plot(train_labels[:,i],train_labels[:,i]+train_bounds[:,i], 'r--')
ax.set_xlabel('Actual Values')
ax.set_ylabel('Estimated Values')
writer.add_figure(f'{name}/training/actual_vs_estimated_value', fig , epoch)
plt.close(fig)
fig, ax = plt.subplots()
ax.scatter(val_labels[:,i],val_outputs[:,i])
ax.plot([val_labels[:,i].min(), val_labels[:,i].max()], [val_outputs[:,i].min(), val_outputs[:,i].max()], 'r--')
ax.set_xlabel('Actual Values')
ax.set_ylabel('Estimated Values')
writer.add_figure(f'{name}/validation/actual_vs_estimated_value', fig , epoch)
plt.close(fig)
fig, ax = plt.subplots()
ax.scatter(train_labels[:,i],train_errors[:,i])
ax.plot(train_labels[:,i],[0 for i in range(train_labels.shape[0])], color='r', linestyle='--')
ax.plot(train_labels[:,i],-train_bounds[:,i], 'r--')
ax.plot(train_labels[:,i],+train_bounds[:,i], 'r--')
ax.set_xlabel('Actual Values')
ax.set_ylabel('Residual error')
writer.add_figure(f'{name}/training/actual_vs_residual', fig , epoch)
plt.close(fig)
fig, ax = plt.subplots()
ax.scatter(val_labels[:,i],val_errors[:,i])
ax.plot(val_labels[:,i],[0 for i in range(val_labels.shape[0])], color='r', linestyle='--')
ax.set_xlabel('Actual Values')
ax.set_ylabel('Residual error')
writer.add_figure(f'{name}/validation/actual_vs_residual', fig , epoch)
plt.close(fig)
with tempfile.TemporaryDirectory() as checkpoint_dir:
checkpoint_data = {
"epoch": epoch,
"net_state_dict": model.state_dict(),
"optimizer_state_dict": optimizer.state_dict(),
"data_preprocess":data_preprocess
}
data_path = pathlib.Path(checkpoint_dir) / "data.pkl"
with open(data_path, "wb") as fp:
pickle.dump(checkpoint_data, fp)
save_inference_files(model,data_preprocess,checkpoint_dir)
checkpoint = ray.train.Checkpoint.from_directory(checkpoint_dir)
ray.train.report(
reports,
checkpoint=checkpoint,
)
else:
ray.train.report(reports,)
t_epoch_end=time.time()
logger.info(f"Save Checkpoint costs {t_epoch_end-t_epoch_val_end}s")
print("Training Epoch:[{}/{}], Average Loss:{}, Cost {}s".format(epoch+1,hydra_cfg.train.max_epoch,train_loss,t_epoch_end-t_epoch_start))
for i,name in enumerate(hydra_cfg.labels_name):
print(f"train_hit_rate_{name}: {train_hit_rate[i]} ",end=" ")
print(f"val_hit_rate_{name}: {val_hit_rate[i]} ",end=" ")
# for i,name in enumerate(hydra_cfg.labels_name):
# print(f"train_hit_rate_{name}: {train_hit_rate[i]} ",end=" ")
# print(f"val_hit_rate_{name}: {val_hit_rate[i]} ",end=" ")
#####################################################
## 每个epoch保存checkpoint并记录到tensorboard ##
#####################################################
#测试
test_dataset=load_dataset(data_preprocess.dataset_save_dir,dataset_type="test")
testloader = torch.utils.data.DataLoader(
test_dataset, batch_size=int(tune_cfg["batch_size"]), shuffle=True, num_workers=hydra_cfg.dataset.num_worker,drop_last=False
)
test_loss = 0.0
test_steps = 0
test_errors=None
test_outputs=None
test_labels=None
test_hit_rate=None
logger.info(f"Test starts")
t_epoch_test_start=time.time()
model.eval()
for batch, (features, labels) in enumerate(testloader):
t_iteration_start=time.time()
with torch.no_grad():
features=features.to(device)
labels=labels.to(device)
outputs = model(features)
loss = criterion(outputs, labels)
labels_npy=labels.cpu().detach().numpy()
outputs_npy=outputs.cpu().detach().numpy()
outputs_npy, labels_npy,error,hit_rate,bounds=data_preprocess.get_metric(outputs_npy, labels_npy)
test_loss += loss.cpu().numpy()
test_steps += 1
if test_steps==1:
test_hit_rate=hit_rate
test_errors=error
test_outputs=outputs_npy
test_labels=labels_npy
else:
test_hit_rate=(test_steps-1)/test_steps *test_hit_rate+ 1/test_steps*hit_rate
test_errors=np.concatenate((test_errors,error),axis=0)
test_outputs=np.concatenate((test_outputs,outputs_npy),axis=0)
test_labels=np.concatenate((test_labels,labels_npy),axis=0)
t_iteration_end=time.time()
print("Test Iteration:{}/{},Loss:{}, Cost {}s".format(test_steps,len(testloader),loss.item(),t_iteration_end-t_iteration_start))
t_epoch_test_end=time.time()
logger.info(f"Test costs {t_epoch_test_end-t_epoch_test_start}s")
tmp_string=""
for i,name in enumerate(data_preprocess.labelNames):
tmp_string+=f"{name}\t:{test_hit_rate[i]*100:.2f}%\n"
writer.add_text(f"test/hit_rate",tmp_string)

48
src/optimizer/utils.py Normal file
View File

@ -0,0 +1,48 @@
import shutil
import pathlib
import pickle
import torch
def save_inference_files(model,data_preprocess,checkpoint_dir):
sava_dir=pathlib.Path(checkpoint_dir)/"inference"
print("开始在checkpoint中保存推理所需文件")
print("choose_frame_spatial文件路径",data_preprocess.choose_frame_spatial.file_path.parent)
#保存choose_frame_spatial相关文件
sava_dir.mkdir(mode=0o777, parents=True, exist_ok=True)
shutil.copy(data_preprocess.choose_frame_spatial.file_path,sava_dir/"choose_frame_spatial.py")
with open(sava_dir/"choose_frame_spatial.pkl",'wb') as f:
pickle.dump(data_preprocess.choose_frame_spatial.state_dict(),f)
shutil.copy(data_preprocess.features_scaling.file_path,sava_dir/"features_scaling.py")
with open(sava_dir/"features_scaling.pkl",'wb') as f:
pickle.dump(data_preprocess.features_scaling.state_dict(),f)
shutil.copy(data_preprocess.labels_scaling.file_path,sava_dir/"labels_scaling.py")
with open(sava_dir/"labels_scaling.pkl",'wb') as f:
pickle.dump(data_preprocess.labels_scaling.state_dict(),f)
with open(sava_dir/"labelNames.pkl",'wb') as f:
pickle.dump(data_preprocess.labelNames,f)
input_tensor = torch.rand((1,*data_preprocess.features_scaling.feature_shape), dtype=torch.float32).to("cuda:0")
torch.onnx.export(
model, # model to export
(input_tensor,), # inputs of the model,
str(sava_dir/"model.onnx"), # filename of the ONNX model
input_names=["input"], # Rename inputs for the ONNX model
dynamo=True # True or False to select the exporter to use
)
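# Editor's note (hedged usage sketch, not part of the commit): the exported artifacts
# could later be consumed roughly like this, assuming onnxruntime is available and the
# scaling objects are rebuilt from the copied *.py files and their *.pkl state_dicts:
#
#   import onnxruntime as ort
#   import numpy as np
#   sess = ort.InferenceSession("inference/model.onnx")
#   x = features_scaling.run(raw_features).astype(np.float32)[None, ...]  # raw_features is a placeholder
#   y_scaled = sess.run(None, {"input": x})[0]
#   y = labels_scaling.reverse(y_scaled)
#
# The name "input" matches input_names in the export call above; everything else here
# is an assumption about the inference side, not code from this repository.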

149
src/scripts/Add_Lab.py Normal file
View File

@ -0,0 +1,149 @@
import pandas as pd
# from tqdm import tqdm
if __name__ =="__main__":
label_file_path="/data/SEMS-model-training/labels/NanEr/2024-05-15_08-03_2335.xlsx"
raw_label_lab_file_path="/data/SEMS-model-training/labels/raw_labels/NanEr/5-8月份转炉TSC和TSO化验结果.xls"
save_dir="/data/SEMS-model-training/labels/NanEr/2024-05-15_08-03_2335_.xlsx"
labels=pd.read_excel(label_file_path)
print(f"原始数据量{labels.shape[0]}")
labels=labels.loc[:,["Furnace_Number","Steel_Type","TSC_start_time","TSC_end_time","TSC_T","TSC_C","TSO_start_time","TSO_end_time","TSO_T","TSO_C"]]
#The filtering below has been folded into the data preprocessing step; dropping a row whenever any column is 0 is too strict, a row should only be dropped when one of the selected labels is 0
# # select the rows containing NULL
null_rows=labels.isnull().any(axis=1)
# # select the rows containing 0
zeros_rows=(labels==0).any(axis=1) | (labels=='0').any(axis=1)
# # drop every row that contains any NULL or 0
selected_rows=~(null_rows|zeros_rows)
labels=labels[selected_rows]
print(f"删除无效数据后{labels.shape[0]}")
labels_output=labels.copy()
labels_output['TSC_P']=0.0
labels_output['TSC_S']=0.0
labels_output['TSC_Mn']=0.0
labels_output['TSC_Ni']=0.0
labels_output['TSC_Mo']=0.0
labels_output['TSC_Cr']=0.0
labels_output['TSO_P']=0.0
labels_output['TSO_S']=0.0
labels_output['TSO_Mn']=0.0
labels_output['TSO_Ni']=0.0
labels_output['TSO_Mo']=0.0
labels_output['TSO_Cr']=0.0
raw_label_lab=pd.read_excel(raw_label_lab_file_path)
raw_label_lab=raw_label_lab.loc[:,["炉次号","分析时间","Mn","P","S","Ni","Cr","Mo"]]
# print(raw_label_lab)
for i in range(labels.shape[0]):
# i=4
furnaceID=labels.iloc[i]["Furnace_Number"]
# print(raw_label_lab[raw_label_lab["炉次号"]==furnaceID])
tmp=raw_label_lab[raw_label_lab["炉次号"]==furnaceID]
if tmp.shape[0]==0:
print(f"炉次号{furnaceID}找不到对应的数据")
continue
elif tmp.shape[0]==1:
print(f"炉次号{furnaceID}找到1个")
labels_output.loc[i,'TSC_P']=tmp.iloc[0]["P"]
labels_output.loc[i,'TSC_S']=tmp.iloc[0]["S"]
labels_output.loc[i,'TSC_Mn']=tmp.iloc[0]["Mn"]
labels_output.loc[i,'TSC_Ni']=tmp.iloc[0]["Ni"]
labels_output.loc[i,'TSC_Mo']=tmp.iloc[0]["Mo"]
labels_output.loc[i,'TSC_Cr']=tmp.iloc[0]["Cr"]
labels_output.loc[i,'TSO_P']=tmp.iloc[0]["P"]
labels_output.loc[i,'TSO_S']=tmp.iloc[0]["S"]
labels_output.loc[i,'TSO_Mn']=tmp.iloc[0]["Mn"]
labels_output.loc[i,'TSO_Ni']=tmp.iloc[0]["Ni"]
labels_output.loc[i,'TSO_Mo']=tmp.iloc[0]["Mo"]
labels_output.loc[i,'TSO_Cr']=tmp.iloc[0]["Cr"]
else:
print(f"炉次号{furnaceID}找到{tmp.shape[0]}")
#find the lab analysis closest in time to the TSC measurement
min_time=None
min_index=0
for j in range(tmp.shape[0]):
# print(j,i)
delta_time=tmp.iloc[j]["分析时间"]-labels.iloc[i]["TSC_end_time"]
if min_time is None:
min_time=delta_time
else:
if delta_time<min_time:
min_time=delta_time
min_index=j
# print(min_time,min_index)
labels_output.loc[i,'TSC_P']=tmp.iloc[min_index]["P"]
labels_output.loc[i,'TSC_S']=tmp.iloc[min_index]["S"]
labels_output.loc[i,'TSC_Mn']=tmp.iloc[min_index]["Mn"]
labels_output.loc[i,'TSC_Ni']=tmp.iloc[min_index]["Ni"]
labels_output.loc[i,'TSC_Mo']=tmp.iloc[min_index]["Mo"]
labels_output.loc[i,'TSC_Cr']=tmp.iloc[min_index]["Cr"]
# labels.iloc[i]['TSO_P']=tmp.iloc[i]["P"]
# labels.iloc[i]['TSO_S']=tmp.iloc[i]["S"]
# labels.iloc[i]['TSO_Mn']=tmp.iloc[i]["Mn"]
# labels.iloc[i]['TSO_Ni']=tmp.iloc[i]["Ni"]
# labels.iloc[i]['TSO_Mo']=tmp.iloc[i]["Mo"]
# labels.iloc[i]['TSO_Cr']=tmp.iloc[i]["Cr"]
#find the lab analysis closest in time to the TSO measurement
min_time=None
min_index=0
for j in range(tmp.shape[0]):
# print(j,i)
# print(tmp.iloc[j]["分析时间"],labels.iloc[i]["TSO_end_time"])
delta_time=tmp.iloc[j]["分析时间"]-labels.iloc[i]["TSO_end_time"]
if min_time is None:
min_time=delta_time
else:
if delta_time<min_time:
min_time=delta_time
min_index=j
# print(min_time,min_index)
labels_output.loc[i,'TSO_P']=tmp.iloc[min_index]["P"]
labels_output.loc[i,'TSO_S']=tmp.iloc[min_index]["S"]
labels_output.loc[i,'TSO_Mn']=tmp.iloc[min_index]["Mn"]
labels_output.loc[i,'TSO_Ni']=tmp.iloc[min_index]["Ni"]
labels_output.loc[i,'TSO_Mo']=tmp.iloc[min_index]["Mo"]
labels_output.loc[i,'TSO_Cr']=tmp.iloc[min_index]["Cr"]
null_rows=labels_output.isnull().any(axis=1)
# # select the rows containing 0
zeros_rows=(labels_output==0).any(axis=1) | (labels_output=='0').any(axis=1)
# # drop every row that contains any NULL or 0
selected_rows=~(null_rows|zeros_rows)
labels_output=labels_output[selected_rows]
print(labels_output)
print(f"插入实验室数据的后的大小{labels_output.shape[0]}")
labels_output.to_excel(save_dir,index=False)
# for i in range(labels.shape[0]):
# print(f"[{i+1}/{labels.shape[0]}],炉次号:{labels.iloc[i]["Furnace_Number"]}")
# furnaceID=labels.iloc[i]["Furnace_Number"]

View File

@ -16,9 +16,10 @@ import os
if __name__ =="__main__":
raw_label_file_path="/home/admin/20240806-NanEr-5-8-data/labels/raw_labels/NanEr/5-8月份6#炉生产数据.xls"
raw_label_file_path="/data/SEMS-model-training/labels/raw_labels/NanEr/5-8月份6#炉生产数据.xls"
# raw_label_lab_file_path="/data/SEMS-model-training/labels/raw_labels/NanEr/5-8月份转炉TSC和TSO化验结果.xls"
save_dir="/home/admin/20240806-NanEr-5-8-data/labels/NanEr"
save_dir="/data/SEMS-model-training/labels/NanEr"
select_and_rename={
"炉号":"Furnace_Number",

View File

@ -0,0 +1,60 @@
import multiprocessing.pool
import pathlib
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA
from sklearn.manifold import TSNE
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.cluster import OPTICS
import multiprocessing
def plot_npz(file_path:pathlib.Path,output_folder:pathlib.Path)->None:
output_folder=output_folder
output_folder.mkdir(mode=0o777,parents=True,exist_ok=True)
data=np.load(file_path,allow_pickle=True)
#此处data["rawSpectralData"]的维度为(时间维度,光谱维度,空间维度)
rawSpectralData=data["rawSpectralData"]
plt.figure()
spectrum_band=np.linspace(400,1000,224)
# plt.ylim([0,4095])
plt.plot(spectrum_band,rawSpectralData)
# plt.title("Max Point Spectrum")
plt.savefig(output_folder/f"{file_path.stem}.png",dpi=100)
plt.close()
#画最强点的光谱
def main(raw_data_folder:pathlib.Path,output_folder:pathlib.Path)->None:
files=list(raw_data_folder.glob("*.npz"))
# plot_npz(files[0],output_folder )
p=multiprocessing.Pool(4)
for i in range(len(files)):
p.apply_async(plot_npz, args=(files[i],output_folder))
print('Waiting for all subprocesses done...')
p.close()
p.join()
print('All subprocesses done.')
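# Editor's note: the apply_async results are never collected here, so any exception
# raised inside plot_npz is swallowed silently; keeping the AsyncResult objects and
# calling .get() on them would surface worker errors.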
if __name__=="__main__":
raw_dataset_folder=pathlib.Path("/data/SEMS-model-training/dataset/DBSCAN_eps_0.15_min_samples_10/pre_dataset")
output_folder=pathlib.Path("/data/SEMS-model-training/dataset/DBSCAN_eps_0.15_min_samples_10/pre_dataset_visual")
main(raw_dataset_folder/"test",output_folder/"test",)
main(raw_dataset_folder/"training",output_folder/"training",)
main(raw_dataset_folder/"validation",output_folder/"validation",)

View File

@ -0,0 +1,265 @@
import multiprocessing.pool
import pathlib
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA
from sklearn.manifold import TSNE
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.cluster import OPTICS
import multiprocessing
def plot_npz(file_path:pathlib.Path,output_folder:pathlib.Path)->None:
output_folder=output_folder/file_path.stem
output_folder.mkdir(mode=0o777,parents=True,exist_ok=True)
data=np.load(file_path,allow_pickle=True)
#此处data["rawSpectralData"]的维度为(时间维度,光谱维度,空间维度)
rawSpectralData=data["rawSpectralData"].transpose(0, 2, 1)
rawSpectralData=rawSpectralData.reshape((rawSpectralData.shape[0]*rawSpectralData.shape[1],rawSpectralData.shape[2]))
#rawSpectralData的维度为时间维度*空间维度,光谱维度)
tmp=np.max(rawSpectralData,axis=1)
rawSpectralData=rawSpectralData[(tmp>500) & (tmp<4095),:]
#选出没有欠曝与过曝的空间点
rawSpectralData_normed=(rawSpectralData-np.min(rawSpectralData,axis=1,keepdims=True))/(np.max(rawSpectralData,axis=1,keepdims=True)-np.min(rawSpectralData,axis=1,keepdims=True))
#归一化所有光谱,去除强度信息
plt.figure()
spectrum_band=np.linspace(400,1000,224)
index=np.argmax(rawSpectralData)
index=np.unravel_index(index, rawSpectralData.shape)
plt.ylim([0,4095])
plt.plot(spectrum_band,rawSpectralData[index[0],:])
plt.title("Max Point Spectrum")
plt.savefig(output_folder/"max_point_spectrum.png",dpi=100)
plt.close()
#画最强点的光谱
################################
# PCA
################################
# pca_norm_1D = PCA(n_components=1)
# norm_feat_1D = pca_norm_1D.fit(rawSpectralData_normed).transform(rawSpectralData_normed)
# pca_norm_2D = PCA(n_components=2)
# norm_feat_2D = pca_norm_2D.fit(rawSpectralData_normed).transform(rawSpectralData_normed)
# pca_norm_3D = PCA(n_components=3)
# norm_feat_3D = pca_norm_3D.fit(rawSpectralData_normed).transform(rawSpectralData_normed)
# pca_1D = PCA(n_components=1)
# feat_1D = pca_1D.fit(rawSpectralData).transform(rawSpectralData)
# pca_2D = PCA(n_components=2)
# feat_2D = pca_2D.fit(rawSpectralData).transform(rawSpectralData)
# pca_3D = PCA(n_components=3)
# feat_3D = pca_3D.fit(rawSpectralData).transform(rawSpectralData)
# fig, axs = plt.subplots(2, 3,figsize=(15,10))
# axs[0,0].scatter(norm_feat_1D, np.zeros((norm_feat_1D.shape[0],)), s=0.1)
# axs[0, 0].set_title(' norm')
# axs[0,1].scatter(norm_feat_2D[:,0], norm_feat_2D[:,1], s=0.1)
# ax_3d = fig.add_subplot(2, 3, 3, projection='3d')
# ax_3d.scatter(norm_feat_3D[:, 0], norm_feat_3D[:, 1], norm_feat_3D[:, 2], s=0.01)
# axs[1,0].scatter(feat_1D, np.zeros((feat_1D.shape[0],)), s=0.1)
# axs[1, 0].set_title(' raw')
# axs[1,1].scatter(feat_2D[:,0], feat_2D[:,1], s=0.1)
# ax_3d = fig.add_subplot(2, 3, 6, projection='3d')
# ax_3d.scatter(feat_3D[:, 0], feat_3D[:, 1], feat_3D[:, 2], s=0.01)
# plt.savefig(output_folder/"PCA.png",dpi=100)
# plt.close()
################################
# KernelPCA gamma =15
################################
def plot_KernelPCA(gamma):
pca_norm_1D = KernelPCA(n_components=1, kernel='rbf', gamma=gamma)
norm_feat_1D = pca_norm_1D.fit(rawSpectralData_normed).transform(rawSpectralData_normed)
pca_norm_2D = KernelPCA(n_components=2, kernel='rbf', gamma=gamma)
norm_feat_2D = pca_norm_2D.fit(rawSpectralData_normed).transform(rawSpectralData_normed)
pca_norm_3D = KernelPCA(n_components=3, kernel='rbf', gamma=gamma)
norm_feat_3D = pca_norm_3D.fit(rawSpectralData_normed).transform(rawSpectralData_normed)
pca_1D = KernelPCA(n_components=1, kernel='rbf', gamma=gamma)
feat_1D = pca_1D.fit(rawSpectralData).transform(rawSpectralData)
pca_2D = KernelPCA(n_components=2, kernel='rbf', gamma=gamma)
feat_2D = pca_2D.fit(rawSpectralData).transform(rawSpectralData)
pca_3D = KernelPCA(n_components=3, kernel='rbf', gamma=gamma)
feat_3D = pca_3D.fit(rawSpectralData).transform(rawSpectralData)
fig, axs = plt.subplots(2, 3,figsize=(15,10))
axs[0,0].scatter(norm_feat_1D, np.zeros((norm_feat_1D.shape[0],)), s=0.1)
axs[0, 0].set_title(' norm')
axs[0,1].scatter(norm_feat_2D[:,0], norm_feat_2D[:,1], s=0.1)
ax_3d = fig.add_subplot(2, 3, 3, projection='3d')
ax_3d.scatter(norm_feat_3D[:, 0], norm_feat_3D[:, 1], norm_feat_3D[:, 2], s=0.01)
axs[1,0].scatter(feat_1D, np.zeros((feat_1D.shape[0],)), s=0.1)
axs[1, 0].set_title(' raw')
axs[1,1].scatter(feat_2D[:,0], feat_2D[:,1], s=0.1)
ax_3d = fig.add_subplot(2, 3, 6, projection='3d')
ax_3d.scatter(feat_3D[:, 0], feat_3D[:, 1], feat_3D[:, 2], s=0.01)
plt.savefig(output_folder/f"KernelPCA_gamma_{gamma}.png",dpi=100)
plt.close()
# plot_KernelPCA(gamma=None)
# plot_KernelPCA(gamma=1)
# plot_KernelPCA(gamma=5)
# plot_KernelPCA(gamma=10)
# plot_KernelPCA(gamma=15)
################################
# t-SNE
################################
# pca_norm_1D = TSNE(n_components=1, learning_rate='auto', init='pca')
# norm_feat_1D = pca_norm_1D.fit(rawSpectralData_normed).fit_transform(rawSpectralData_normed)
# pca_norm_2D = TSNE(n_components=2, learning_rate='auto', init='pca')
# norm_feat_2D = pca_norm_2D.fit(rawSpectralData_normed).fit_transform(rawSpectralData_normed)
# pca_norm_3D = TSNE(n_components=3, learning_rate='auto', init='pca')
# norm_feat_3D = pca_norm_3D.fit(rawSpectralData_normed).fit_transform(rawSpectralData_normed)
# pca_1D = TSNE(n_components=1, learning_rate='auto', init='pca')
# feat_1D = pca_1D.fit(rawSpectralData).fit_transform(rawSpectralData)
# pca_2D = TSNE(n_components=2, learning_rate='auto', init='pca')
# feat_2D = pca_2D.fit(rawSpectralData).fit_transform(rawSpectralData)
# pca_3D = TSNE(n_components=3, learning_rate='auto', init='pca')
# feat_3D = pca_3D.fit(rawSpectralData).fit_transform(rawSpectralData)
# fig, axs = plt.subplots(2, 3,figsize=(15,10))
# axs[0,0].scatter(norm_feat_1D, np.zeros((norm_feat_1D.shape[0],)), s=0.1)
# axs[0, 0].set_title(' norm')
# axs[0,1].scatter(norm_feat_2D[:,0], norm_feat_2D[:,1], s=0.1)
# ax_3d = fig.add_subplot(2, 3, 3, projection='3d')
# ax_3d.scatter(norm_feat_3D[:, 0], norm_feat_3D[:, 1], norm_feat_3D[:, 2], s=0.01)
# axs[1,0].scatter(feat_1D, np.zeros((feat_1D.shape[0],)), s=0.1)
# axs[1, 0].set_title(' raw')
# axs[1,1].scatter(feat_2D[:,0], feat_2D[:,1], s=0.1)
# ax_3d = fig.add_subplot(2, 3, 6, projection='3d')
# ax_3d.scatter(feat_3D[:, 0], feat_3D[:, 1], feat_3D[:, 2], s=0.01)
# plt.savefig(output_folder/"t-SNE.png",dpi=100)
# plt.close()
def plot_DBSCAN(eps=0.15, min_samples=10):
db_norm = DBSCAN(eps=eps, min_samples=min_samples).fit(rawSpectralData_normed)
labels_norm = db_norm.labels_
n_norm = len(set(labels_norm)) - (1 if -1 in labels_norm else 0)
n_noise_norm = list(labels_norm).count(-1)
max_i=0
max_num=0
for i in range(n_norm):
tmp=(labels_norm==i).sum()
if tmp>max_num:
max_i=i
max_num=tmp
fig, axs = plt.subplots(2, 2,figsize=(10,10))
for i in range(labels_norm.shape[0]):
if labels_norm[i]==max_i:
axs[0,0].plot(rawSpectralData_normed[i])
axs[0,0].set_title(f'norm data w norm cluster {max_num},{n_norm},{n_noise_norm}')
for i in range(labels_norm.shape[0]):
if labels_norm[i]==max_i:
axs[0,1].plot(rawSpectralData[i])
axs[0,1].set_title('raw data w norm cluster')
db_raw = DBSCAN(eps=eps, min_samples=min_samples).fit(rawSpectralData)
labels_raw = db_raw.labels_
n_raw = len(set(labels_raw)) - (1 if -1 in labels_raw else 0)
n_noise_raw = list(labels_raw).count(-1)
max_i=0
max_num=0
for i in range(n_raw):
tmp=(labels_raw==i).sum()
if tmp>max_num:
max_i=i
max_num=tmp
for i in range(labels_raw.shape[0]):
if labels_raw[i]==max_i:
axs[1,0].plot(rawSpectralData_normed[i])
axs[1,0].set_title(f'norm data w raw cluster {max_num},{n_raw},{n_noise_raw}')
for i in range(labels_raw.shape[0]):
if labels_raw[i]==max_i:
axs[1,1].plot(rawSpectralData[i])
axs[1,1].set_title('raw data w raw cluster')
plt.savefig(output_folder/f"DBSCAN_eps_{eps}_min_samples_{min_samples}.png",dpi=100)
plt.close()
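# Editor's note: DBSCAN marks noise points with label -1, so the n_* counts above
# exclude noise; the loops pick the most populated cluster (max_i) and overlay the
# spectra belonging to it, once for the normalized data and once for the raw data.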
plot_DBSCAN(eps=0.15, min_samples=10)
# plot_DBSCAN(eps=100, min_samples=10)
# print(file_path.stem,rawSpectralData.shape)
def main(raw_data_folder:pathlib.Path,output_folder:pathlib.Path)->None:
files=list(raw_data_folder.glob("*.npz"))
# plot_npz(files[0],output_folder )
p=multiprocessing.Pool(4)
for i in range(len(files)):
p.apply_async(plot_npz, args=(files[i],output_folder))
print('Waiting for all subprocesses done...')
p.close()
p.join()
print('All subprocesses done.')
if __name__=="__main__":
raw_dataset_folder=pathlib.Path("/data/SEMS-model-training/dataset/raw_dataset")
output_folder=pathlib.Path("/data/SEMS-model-training/dataset/raw_dataset_visual")
main(raw_dataset_folder/"test",output_folder/"test",)
main(raw_dataset_folder/"training",output_folder/"training",)
main(raw_dataset_folder/"validation",output_folder/"validation",)

View File

@ -0,0 +1,276 @@
'''
@File : pre_process.py
@Time : 2024/08/09 13:54:28
@Author : Zhanpeng Yang
@Version : 0.0.1
@Contact : zhpyang@outlook.com
@Desc : 进行数据预处理
'''
import pandas as pd
import logging
import os,glob
import numpy as np
import datetime
import pickle
import h5py
class DataPreProcess:
"""
Contains all data preprocessing steps, including:
1. extracting the spectra around the target time from the raw binary data, using the Excel file provided by the steel plant
2. selecting suitable, stable time frames and spatial points from those spectra as the features
3. scaling the features and labels
"""
def __init__(self, raw_spectral_data_dir, raw_labels_dir,labels_name,dataset_dir,train_ratio=0.8,validate_ratio=0.1) -> None:
"""初始化类,传入所有数据预处理需要的参数
Args:
raw_spectral_data_dir (_type_):原始光谱数据路径
raw_labels_dir (_type_): 原始标签路径
labels_name (_type_): 提取出的标签名
dataset_dir (_type_): 生成的数据集文件夹
choose_frame_spatial (_type_): 类对象进行光谱特征挑选
features_scaling (_type_): 类对象对输入的特征进行放缩使之直接输入神经网络
labels_scaling (_type_): 类对象对输出的的标签进行放缩
train_ratio (float, optional): 训练集比例. Defaults to 0.8.
validate_ratio (float, optional): 验证集比例剩余的即为测试集. Defaults to 0.1.
"""
self.raw_spectral_data_dir=raw_spectral_data_dir
self.raw_labels_dir=raw_labels_dir
self.dataset_dir=dataset_dir
self.labels_name=labels_name
self.train_ratio=train_ratio
self.validate_ratio=validate_ratio
#加载原始的标签
self.raw_labels=pd.read_excel(raw_labels_dir)
#加载原始光谱的缓存
self.raw_spectral_data_cache=self._load_raw_spectral_data_cache(raw_spectral_data_dir)
#随便加载一个csv文件判断光谱数据的维度
self.spectral_dim,self.spatial_dim= self._get_spectral_spatial_dim(self.raw_spectral_data_cache)
#正式开始原始数据转化为数据集
self._raw_data_2_dataset()
# def _load_raw_labels(self,raw_labels_dir:str,labels_name:list)->pd.DataFrame:
# """读取利用脚本处理后的钢厂给的excel文件所在文件夹会扫描所有文件
# 并选择指定的label名作为output
# 并去除值为NaN或0的行
# Args:
# raw_labels_dir (str): 利用脚本处理后的钢厂给的excel路径
# labels_name (list): 指定的作为Label的列
# Returns:
# pd.DataFrame: 返回所有筛选后的炉次数据
# """
# raw_labels=None
# for name in os.listdir(raw_labels_dir):
# tmp_raw_labels=pd.read_excel(os.path.join(raw_labels_dir,name))
# choosed_column=["TSC_start_time","TSC_end_time"]
# choosed_column=choosed_column+labels_name
# #只选出我们想要的部分作为标签
# tmp_raw_labels=tmp_raw_labels.loc[:,choosed_column]
# # 选出有NULL的行
# null_rows=tmp_raw_labels.isnull().any(axis=1)
# # # 选出有0的行
# zeros_rows=(tmp_raw_labels==0).any(axis=1) | (tmp_raw_labels=='0').any(axis=1)
# # # 每一行但凡有NULL或者0都给删了
# selected_rows=~(null_rows|zeros_rows)
# tmp_raw_labels=tmp_raw_labels[selected_rows]
# if raw_labels is None:
# raw_labels=tmp_raw_labels
# else:
# raw_labels=pd.concat([raw_labels,tmp_raw_labels],axis=0)
# logging.debug(f"Reading raw label excel file:{name}, which has {tmp_raw_labels.shape[0]} furnaces")
# logging.debug(f"Readed raw label excel files, which has {raw_labels.shape[0]} furnaces in total")
# return raw_labels
def _load_raw_spectral_data_cache(self,raw_spectral_data_dir:str)->list:
"""生成所有原始光谱数据文件的缓存,包括每个文件记录的开始及结束时间,目的为加快后面读取原始数据的速度
Args:
raw_spectral_data_dir (str): 原始光谱所在路径
Returns:
list: 缓存其中有多少个成员就有多少个原始数据文件每个成员包括格式为datetime的开始及结束时间及文件路径
"""
spectral_file_paths=glob.glob(os.path.join(raw_spectral_data_dir,"*.csv"))
cache_file_paths=glob.glob(os.path.join(raw_spectral_data_dir,"*.pkl"))
update_flag=False
#regenerate the cache when no cache file exists, or when the number of files recorded in the cache differs from the number of files in the folder
if len(cache_file_paths)==0:
logging.debug(f"Raw spectral data cache is not existed! Generating")
update_flag=True
elif len(cache_file_paths)==1:
with open(cache_file_paths[0],"rb") as f:
raw_spectral_data_cache=pickle.load(f)
if len(raw_spectral_data_cache) !=len(spectral_file_paths):
logging.debug(f"Raw spectral data cache is out of date! Regenerating, cache file number:{len(raw_spectral_data_cache)}, spectral data file number: {len(spectral_file_paths)}")
update_flag=True
else:
logging.error(f"More the one 'raw_spectral_data_cache.pkl' file is existed in {raw_spectral_data_dir}")
if update_flag:
spectral_file_paths.sort()
raw_spectral_data_cache=[]
for file in spectral_file_paths:
tmp_info={}
tmp_data=np.loadtxt(file, delimiter=",")
start_t=datetime.datetime.fromtimestamp(tmp_data[0,0]/1000)+datetime.timedelta(microseconds=tmp_data[0,0]%1000)
end_t=datetime.datetime.fromtimestamp(tmp_data[-1,0]/1000)+datetime.timedelta(microseconds=tmp_data[-1,0]%1000)
tmp_info["start_t"]=start_t
tmp_info["end_t"]=end_t
tmp_info["file_path"]=file
raw_spectral_data_cache.append(tmp_info)
with open(os.path.join(raw_spectral_data_dir,f"raw_spectral_data_cache.pkl"),"wb") as f:
pickle.dump(raw_spectral_data_cache,f)
return raw_spectral_data_cache
def _get_spectral_spatial_dim(self,raw_spectral_data_cache):
data=np.loadtxt(raw_spectral_data_cache[0]["file_path"], delimiter=",").astype(np.uint64)
if data[0,2]==229376:
spectral_dim =224
spatial_dim=512
elif data[0,2]==917504:
spectral_dim =448
spatial_dim=1024
else:
raise ValueError(f"Unexpected frame byte size {data[0,2]}; cannot infer spectral/spatial dimensions")
return spectral_dim, spatial_dim
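# Editor's note: the magic byte counts above correspond to one uint16 frame,
# i.e. 224*512*2 = 229376 bytes and 448*1024*2 = 917504 bytes respectively.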
def _read_spectral_data(self,start_time:datetime.datetime,end_time:datetime.datetime)->np.ndarray:
"""获取从start_time到end_time的光谱数据
Args:
start_time (datetime.datetime): 开始时间
end_time (datetime.datetime): 结束时间
Returns:
np.ndarray: 原始光谱数据
"""
def get_spectral_data_per_file(file_path,s_t,e_t):
data=np.loadtxt(file_path, delimiter=",").astype(np.uint64)
if s_t is not None:
tmp_s=datetime.datetime.timestamp(s_t)*1000
tmp_s_index=0
for i in range(data.shape[0]-1):
if data[i,0]<=tmp_s and data[i+1,0]>=tmp_s:
tmp_s_index=i
break
else:
tmp_s_index=0
if e_t is not None:
tmp_e=datetime.datetime.timestamp(e_t)*1000
tmp_e_index=data.shape[0]
for i in range(tmp_s_index,data.shape[0]-1):
if data[i,0]<=tmp_e and data[i+1,0]>=tmp_e:
tmp_e_index=i
break
else:
tmp_e_index=data.shape[0]
with open(file_path[:-3]+"bin", "rb") as f:
f.seek(data[tmp_s_index,1])
d=f.read(np.uint64((tmp_e_index-tmp_s_index)*data[tmp_s_index,2]))
d=np.frombuffer(d, dtype=np.uint16).reshape(tmp_e_index-tmp_s_index,self.spectral_dim,self.spatial_dim)
return data[tmp_s_index:tmp_e_index,0],d
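# Editor's note (reading of the helper above): each CSV row is assumed to hold
# (timestamp in ms, byte offset into the companion .bin file, frame size in bytes);
# the .bin file stores consecutive uint16 frames of shape (spectral_dim, spatial_dim),
# so the helper seeks to the first selected frame and reads
# (tmp_e_index - tmp_s_index) frames in one go.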
timestamps=None
raw_spectral_data=None
for tmp_info in self.raw_spectral_data_cache:
tmp_data=None
if start_time<tmp_info["start_t"] and end_time>tmp_info["end_t"] and end_time<tmp_info["end_t"]:
# 目标时间段在交于此文件时间段的左侧。所以取从文件开始到end time的数据
tmp_time_stamp,tmp_data=get_spectral_data_per_file(tmp_info["file_path"] ,None,end_time)
elif start_time<tmp_info["start_t"] and end_time>tmp_info["end_t"]:
# 目标时间段完全包含此文件时间段。所以取从文件开始到结束的数据
tmp_time_stamp,tmp_data=get_spectral_data_per_file(tmp_info["file_path"] ,None,None)
elif start_time>tmp_info["start_t"] and end_time<tmp_info["end_t"]:
# 目标时间段完全被包含此文件时间段。所以取从start_time到end_time的数据
tmp_time_stamp,tmp_data=get_spectral_data_per_file(tmp_info["file_path"] ,start_time,end_time)
elif start_time>tmp_info["start_t"] and start_time<tmp_info["end_t"] and end_time>tmp_info["end_t"]:
# 目标时间段在交于此文件时间段的右侧。所以取从文件start_time到文件结束的数据
tmp_time_stamp,tmp_data=get_spectral_data_per_file(tmp_info["file_path"] ,start_time,None)
if tmp_data is not None:
if raw_spectral_data is None:
timestamps=tmp_time_stamp
raw_spectral_data=tmp_data
else:
timestamps=np.concatenate((timestamps,tmp_time_stamp),axis=0)
raw_spectral_data=np.concatenate((raw_spectral_data,tmp_data),axis=0)
return timestamps,raw_spectral_data
def _raw_data_2_dataset(self):
save_dir=os.path.join(self.dataset_dir)
#Step 1: select the feature time frames and spatial points, then save the result
pre_dataset_save_dir=os.path.join(save_dir,"raw_dataset")
if not os.path.exists(pre_dataset_save_dir):
os.makedirs(pre_dataset_save_dir)
#Regenerate only when the dataset path does not exist; if it already exists this part is skipped.
# Note that when new raw data arrives, the folders under dataset therefore have to be deleted (i.e. the cache cleared) so everything is regenerated
for i in range(self.raw_labels.shape[0]):
start_time,end_time=self.raw_labels.iloc[i]["TSC_start_time"]+datetime.timedelta(seconds=-10),self.raw_labels.iloc[i]["TSC_end_time"]
timestamps,raw_spectral_data=self._read_spectral_data(start_time,end_time)
logging.debug(f"{self.raw_labels.iloc[i]["TSC_start_time"]+datetime.timedelta(seconds=-10)},{self.raw_labels.iloc[i]["TSC_end_time"]}")
if raw_spectral_data is not None:
#only record when the retrieved data has a frame rate above 2 frames per second
if raw_spectral_data.shape[0]>2*(end_time-start_time).total_seconds():
logging.debug(f"PreProcess Stage 1: [{i+1}/{self.raw_labels.shape[0]}] with {timestamps.shape[0]} frames")
# raw_spectral_data=self.choose_frame_spatial.run(timestamps,raw_spectral_data)
np.savez(os.path.join(pre_dataset_save_dir,f"{self.raw_labels.iloc[i]["Furnace_Number"]}.npz",),furnaceNumber=self.raw_labels.iloc[i]["Furnace_Number"],measureStartDatetime=self.raw_labels.iloc[i]["TSC_start_time"].strftime('%Y-%m-%d %H:%M:%S'),measureEndDatetime=self.raw_labels.iloc[i]["TSC_end_time"].strftime('%Y-%m-%d %H:%M:%S'),timestamps=timestamps,rawSpectralData=raw_spectral_data,rawLabels=self.raw_labels.iloc[i][self.labels_name].to_numpy(),labelNames=["Temperature","C","P","S","Mn","Ni","Mo","Cr"])
# else:
# logging.info(f"Pre Dataset is existed in {pre_dataset_save_dir}")
if __name__=="__main__":
logging.basicConfig(level = logging.DEBUG)
raw_data_dir="/data/SEMS-model-training/old_rawdata"
labels_path="/data/SEMS-model-training/labels/NanEr/2024-05-15_08-03_2335_.xlsx"
dataset_dir="/data/SEMS-model-training/dataset"
labels_name=["TSC_T","TSC_C","TSC_P","TSC_S","TSC_Mn","TSC_Ni","TSC_Mo","TSC_Cr"]
# from choose_frame_spatial.mean import ChooseFrameSpatial
# from features_scaling.max_min import FeatureScaling
# from labels_scaling.max_min import LabelScaling
# choose_frame_spatial=ChooseFrameSpatial(interval=[-30,30])
# features_scaling=FeatureScaling()
# labels_scaling=LabelScaling()
data_pre_process=DataPreProcess(raw_data_dir,labels_path,labels_name,dataset_dir)