init repository

This commit is contained in:
user 2024-09-11 19:15:44 +08:00
commit cfac5fd675
40 changed files with 1757 additions and 0 deletions

5
.gitignore vendored Normal file
View File

@ -0,0 +1,5 @@
dataset
rawdata
labels
logs

5
CHANGELOG.md Normal file
View File

@ -0,0 +1,5 @@
# Change Log
## v0.1.0.20240910_alpha
- Pipeline runs end to end

9
README.md Normal file
View File

@ -0,0 +1,9 @@
Source code files are under the `src` folder.
```
cd src
python main_train.py
```
Dependencies:
pip install hydra-core "ray[tune]" torchinfo
pip install h5py tensorboard matplotlib openpyxl
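For orientation (inferred from `.gitignore` and `configs/main_train.yaml`, not spelled out in this commit), the data directory referenced by the configs is expected to look roughly like:
```
/code/admin/20240806-NanEr-5-8-data/   # data root set in configs/main_train.yaml
    rawdata/        # raw spectral *.csv index files plus the matching *.bin frame payloads
    labels/NanEr/   # standardized label spreadsheets produced by Naner_label_excel_standardization.py
    dataset/        # HDF5 datasets generated by src/data/pre_process.py
```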

0
TODO.md Normal file
View File

View File

@ -0,0 +1,21 @@
# https://hydra.cc/docs/configure_hydra/intro/
# enable color logging
# defaults:
# - override hydra_logging: colorlog
# - override job_logging: colorlog
# output directory, generated dynamically on each run
run:
dir: ../logs/${task_name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
# sweep:
# dir: ${paths.log_dir}/${task_name}/multiruns/${now:%Y-%m-%d}_${now:%H-%M-%S}
# subdir: ${hydra.job.num}
job_logging:
handlers:
file:
# Incorporates fix from https://github.com/facebookresearch/hydra/pull/2242
filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
verbose: [__main__,hydra]
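For reference (not part of the config itself): with `task_name: main` from `configs/main_train.yaml` below, and Hydra's default job name (the entry script, so `main_train`), a run started at this commit's timestamp would write its log to approximately:
```
../logs/main/2024-09-11_19-15-44/main_train.log
```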

20
configs/main_train.yaml Normal file
View File

@ -0,0 +1,20 @@
defaults:
- _self_
- hydra: default
- ray_tune: default
task_name: main
raw_spectral_data_dir: /code/admin/20240806-NanEr-5-8-data/rawdata
raw_labels_dir: /code/admin/20240806-NanEr-5-8-data/labels/NanEr
dataset_dir: /code/admin/20240806-NanEr-5-8-data/dataset
labels_name: ["TSC_T","TSC_C","TSC_C_lab","TSC_P_lab"]
dataset:
train_ratio: 0.8
validate_ratio: 0.1
num_worker: 128
train:
max_epoch: 10000
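Not in the commit, but any of these values can be overridden on the command line using standard Hydra override syntax, for example:
```
cd src
python main_train.py train.max_epoch=100 dataset.num_worker=16 dataset.train_ratio=0.7
```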

View File

@ -0,0 +1,51 @@
run:
num_samples: 1
resources_per_trial:
cpu: 128
gpu: 1
scheduler:
_target_: ray.tune.schedulers.ASHAScheduler
metric: val_loss
mode: min
max_t: 20000
grace_period: 1
reduction_factor: 2
config:
choose_frame_spatial:
_target_: ray.tune.grid_search
values:
- _target_: data.choose_frame_spatial.mean.ChooseFrameSpatial
interval: [-3,0]
features_scaling:
_target_: ray.tune.grid_search
values:
- _target_: data.features_scaling.max_min.FeatureScaling
labels_scaling:
_target_: ray.tune.grid_search
values:
- _target_: data.labels_scaling.max_min.LabelScaling
model:
_target_: ray.tune.grid_search
values:
# - _target_: model.FCNN.FCNN
# - _target_: model.CNN_LSTM_FCNN.CNN_LSTM_FCNN
- _target_: model.CNN1D_FCNN.CNN1D_FCNN
criterion:
_target_: ray.tune.grid_search
values:
- _target_: torch.nn.MSELoss
optimizer:
_target_: ray.tune.grid_search
values:
- _target_: torch.optim.Adam
_partial_: true
lr: 0.0001
batch_size:
_target_: ray.tune.grid_search
values:
- 128
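For context, `src/main_train.py` (below) turns each `_target_` entry under `config:` into a Ray Tune search space via `hydra.utils.instantiate(..., _recursive_=False)`. A minimal sketch of that pattern, using a hypothetical learning-rate node rather than one taken from this file:
```
import hydra
from omegaconf import OmegaConf

# Hypothetical node in the same style as the entries above: a grid search over two values.
node = OmegaConf.create({"_target_": "ray.tune.grid_search", "values": [0.001, 0.0001]})

# _recursive_=False keeps nested _target_ entries (models, criteria, ...) un-instantiated,
# so they can be instantiated later inside the trainable, as trainable.py does.
search_space = hydra.utils.instantiate(node, _recursive_=False)
print(search_space)  # {'grid_search': [0.001, 0.0001]}
```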

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1,100 @@
'''
@File : mean.py
@Time : 2024/08/12 13:53:36
@Author : Zhanpeng Yang
@Version : 0.0.1
@Contact : zhpyang@outlook.com
@Desc : Stage-1 data preprocessing: select suitable spectra from the continuous spectral stream as the input features
'''
import datetime
import numpy as np
class ChooseFrameSpatial:
def __init__(self, interval=[-3,0]):
"""注意,算法所需要的超参数均从此处传入,作为类成员,其他函数不允许传入超参数
Args:
interval (list, optional): 此方法依赖的光谱时间范围默认[-30,30]即获取前TSC开始时刻前30秒到TSC结束时刻的光谱数据作为此算法输入.
"""
self.description=f"{str(self.__class__).split('.')[2]}_{interval[0]}_{interval[1]}"
# print(self.description)
self.interval=interval
def get_data_interval(self, start_time, end_time ):
return start_time+datetime.timedelta(seconds=self.interval[0]),start_time+datetime.timedelta(seconds=self.interval[1])
def run(self,timestamps,rawdata):
############# Spatial data compression ######################
space_intensity_mean = np.mean(np.sum(rawdata, axis=1), axis=0)
space_num= 10 # number of spatial points with the smallest variance to keep
light_indices = np.where( (space_intensity_mean > (0.25*np.max(space_intensity_mean)+0.75*np.min(space_intensity_mean)))
& (space_intensity_mean < (0.8*np.max(space_intensity_mean)+0.2*np.min(space_intensity_mean)))
)
if light_indices[0].shape[0] < space_num:
print('Not enough usable spatial points found!')
return None
intensity_data = rawdata[:,:,light_indices[0]]
spatial_variance = np.var(np.sum(intensity_data,axis=1), axis=0)
top_10_indices = np.argsort(spatial_variance)[:space_num]
space_selected_data = intensity_data[:, :, top_10_indices]
############# Temporal data compression ######################
# framerate = 24
# timewindow = 2
# time_data_intensity = np.sum(np.mean(space_selected_data,axis=2),axis=1)
# min_var = float('inf')
# min_index = 0
# for i in range(len(time_data_intensity)-framerate*timewindow-1):
# window_var = np.var(time_data_intensity[i:i+framerate*timewindow])
# if window_var < min_var:
# min_var = window_var
# min_index = i
# selected_data = space_selected_data[min_index:min_index+framerate*timewindow,:,:]
selected_data=np.mean(np.mean(space_selected_data,axis=0),axis=1)
# print("timewindow_begin=",min_index)
# if (result_dir is not None) and (filenum is not None) :
# for i in range (selected_data.shape[2]):
# Z=selected_data[:,:,i]
# x=np.linspace(400,1000,Z.shape[1])
# y=selected_data[:,1,i]
# x, y = np.meshgrid(x, y)
# fig, ax = plt.subplots(subplot_kw=dict(projection='3d'))
# surf = ax.plot_surface(x, y, Z, cmap=cm.Blues,
# linewidth=0, antialiased=False)
# ax.view_init(elev=30, azim=-60) # change the viewing angle
# ax.set_zlim(0, 4095)
# ax.set_zlabel("Light Intensity")
# ax.set_xlabel("Spectral Band(nm)")
# ax.set_ylabel("Time (Serial Number)")
# plt.savefig(f"{result_dir}{ filenum} file {i}-th spatial_point_line.png")
# plt.close()
return selected_data
if __name__ =="__main__":
timestamp=np.zeros((600,))
input_data=np.random.random((600,224,512))*4095
tmp=ChooseFrameSpatial()
output=tmp.run(timestamp,input_data)
print(f"Input data shape: {input_data.shape}\nOutput data shape: {output.shape}")

View File

@ -0,0 +1,132 @@
'''
@File : mean.py
@Time : 2024/08/12 13:53:36
@Author : Zhanpeng Yang
@Version : 0.0.1
@Contact : zhpyang@outlook.com
@Desc : Stage-1 data preprocessing: select suitable spectra from the continuous spectral stream as the input features
'''
import datetime
import numpy as np
class ChooseFrameSpatial:
def __init__(self, interval=[-30,30],overexposion_threshold = 1,weight_mean_var = 0.5,space_num= 10, time_num = 9,framerate = 4,timewindow_time = 1,Type=6):
"""注意,算法所需要的超参数均从此处传入,作为类成员,其他函数不允许传入超参数
Args:
interval (list, optional): 此方法依赖的光谱时间范围默认[-30,30]即获取前TSC开始时刻前30秒到TSC结束时刻的光谱数据作为此算法输入.
"""
# print(str(self.__class__).split("."))
self.description=f"{str(self.__class__).split('.')[-2]}_{interval[0]}_{interval[1]}"
# print(self.description)
self.overexposion_threshold = overexposion_threshold # overexposure threshold: a spatial point is judged overexposed once this many of its time points are overexposed
self.weight_mean_var =weight_mean_var # intensity/variance selection weight: mean intensity is weighted 1 and this is the variance weight, in [0, 1]
self.space_num= space_num # number of spatial points to select
self.time_num = time_num # number of time segments to select
self.framerate = framerate # sampling frame rate
self.timewindow = framerate * timewindow_time # time window length in frames (1 s by default)
self.interval=interval
self.Type=Type
def get_data_interval(self, start_time, end_time ):
return start_time+datetime.timedelta(seconds=self.interval[0]),end_time+datetime.timedelta(seconds=self.interval[1])
def Overexposed_spatial_point_detection(self,inputdata, timethreshold):
#Decide whether a spatial point is overexposed. inputdata is a 2-D (time, wavelength) array; timethreshold is how many overexposed time points it takes to flag the whole point as overexposed
# Returns False if overexposed, True otherwise
row_max_values= np.max(inputdata, axis=1)
overexposed_rows = np.sum(row_max_values >= 4095)
if overexposed_rows > timethreshold:
return False
else:
return True
def run(self,timestamps,rawdata):
# Dimensionality reduction: spatially, keep points whose intensity clears a threshold and whose variance is smallest; temporally, split the whole interval into segments and average the highest-scoring 1 s window of each. Type=1 keeps the first 1/3 of segments, Type=2 the middle 1/3, Type=3 the last 1/3, Type=4 the first 2/3, Type=5 the last 2/3, Type=6 all of them
############# Remove overexposed spatial points ######################
unoverposed_indices = [i for i in range(rawdata.shape[2]) if self.Overexposed_spatial_point_detection(rawdata[:, :, i],self.overexposion_threshold)]
rawdata = rawdata[:,:,unoverposed_indices]
############# Spatial data compression ######################
space_intensity_mean = np.mean(np.sum(rawdata, axis=1), axis=0)
space_intensity_var = np.var(np.sum(rawdata, axis=1), axis=0)
combined_score = (space_intensity_mean - np.min(space_intensity_mean)) / (np.max(space_intensity_mean) - np.min(space_intensity_mean)) \
- self.weight_mean_var * (space_intensity_var - np.min(space_intensity_var)) / (np.max(space_intensity_var) - np.min(space_intensity_var))
sorted_indices = np.argsort(combined_score)[::-1]
space_selected_data = rawdata[:,:,sorted_indices[:self.space_num]]
############# Temporal data compression ######################
space_selected_data_intensity = np.sum(space_selected_data,axis=1)
#Split the intensity data along time into time_num chunks, each of shape (total_time_points/time_num, space_num)
time_length = space_selected_data_intensity.shape[0]
chunk_size = time_length // self.time_num
remainder = time_length % self.time_num
start_index = 0
time_selected_data =None
for i in range(self.time_num):
end_index = start_index + chunk_size + (1 if i < remainder else 0)
chunk = space_selected_data_intensity[start_index:end_index, :]
window_var = np.empty(0)
window_mean = np.empty(0)
for j in range(len(chunk)-self.timewindow-1):
window_var = np.concatenate((window_var, np.expand_dims( np.sum(np.var(chunk[j:j+self.timewindow,:], axis=0)),0)), axis=0)
window_mean = np.concatenate((window_mean, np.expand_dims( np.sum(np.mean(chunk[j:j+self.timewindow,:], axis=0)),0)), axis=0)
combined_score_time = (window_mean - np.min(window_mean)) / (np.max(window_mean) - np.min(window_mean)) \
- self.weight_mean_var * (window_var - np.min(window_var)) / (np.max(window_var) - np.min(window_var))
sorted_indices = np.argsort(combined_score_time)[::-1]
time_window_data = np.mean(space_selected_data[start_index+sorted_indices[0]:start_index+sorted_indices[0]+self.timewindow, :, :],axis=0)
# print(time_selected_data.shape,time_window_data.shape,np.expand_dims( time_window_data,0).shape)
if time_selected_data is None:
time_selected_data=np.expand_dims( time_window_data,0)
else:
time_selected_data = np.concatenate((time_selected_data, np.expand_dims( time_window_data,0)), axis=0)
start_index = end_index
if self.Type == 1:
print("time_selected_data[前1/3].shape: ", time_selected_data[:self.time_num//3,:,:].shape)
return time_selected_data[:self.time_num//3,:,:]
elif self.Type == 2:
print("time_selected_data[中1/3].shape: ", time_selected_data[self.time_num//3:2*self.time_num//3,:,:].shape)
return time_selected_data[self.time_num//3:2*self.time_num//3,:,:]
elif self.Type == 3:
print("time_selected_data[后1/3].shape: ", time_selected_data[2*self.time_num//3:,:,:].shape)
return time_selected_data[2*self.time_num//3:,:,:]
elif self.Type == 4:
print("time_selected_data[前2/3].shape: ", time_selected_data[:2*self.time_num//3,:,:].shape)
return time_selected_data[:2*self.time_num//3,:,:]
elif self.Type == 5:
print("time_selected_data[后2/3].shape: ", time_selected_data[self.time_num//3:,:,:].shape)
return time_selected_data[self.time_num//3:,:,:]
elif self.Type == 6:
print("time_selected_data.shape: ", time_selected_data.shape)
return time_selected_data
else:
print("Type is not 1, 2, 3, 4, 5 or 6 !")
return None
if __name__ =="__main__":
timestamp=np.zeros((600,))
input_data=np.random.random((600,224,512))*4095
tmp=ChooseFrameSpatial()
output=tmp.run(timestamp,input_data)
print(f"Input data shape: {input_data.shape}\nOutput data shape: {output.shape}")

View File

@ -0,0 +1,30 @@
'''
@File : max_min.py
@Time : 2024/08/12 14:02:59
@Author : Zhanpeng Yang
@Version : 0.0.1
@Contact : zhpyang@outlook.com
@Desc : Min-max scaling of the input features
'''
import numpy as np
class FeatureScaling:
def __init__(self):
self.description=f"{str(self.__class__).split('.')[-2]}"
self.flag_get_global_info=True
self._min=999999999
self._max=0
def get_global_info(self,feature):
tmp_max=np.max(feature)
tmp_min=np.min(feature)
if tmp_max>self._max:
self._max=tmp_max
if tmp_min<self._min:
self._min=tmp_min
def run( self,feature):
feature=(feature-self._min)/(self._max-self._min)
return feature
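A minimal usage sketch (not in the commit) of the two-pass protocol that `pre_process.py` below follows: one pass over all samples to collect the global min/max, then a second pass to scale each sample into [0, 1]. Run from `src/` so the `data` package is importable; the toy arrays are invented.
```
import numpy as np
from data.features_scaling.max_min import FeatureScaling

scaler = FeatureScaling()
samples = [np.random.random((9, 224, 10)) * 4095 for _ in range(5)]  # toy spectra
if scaler.flag_get_global_info:
    for s in samples:          # pass 1: accumulate the global min and max
        scaler.get_global_info(s)
scaled = [scaler.run(s) for s in samples]  # pass 2: min-max scale each sample
print(scaled[0].min(), scaled[0].max())    # both end up within [0, 1]
```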

View File

@ -0,0 +1,45 @@
'''
@File : max_min.py
@Time : 2024/08/12 14:03:38
@Author : Zhanpeng Yang
@Version : 0.0.1
@Contact : zhpyang@outlook.com
@Desc : Min-max scaling of the output labels
'''
import numpy as np
class LabelScaling:
def __init__(self):
self.description=f"{str(self.__class__).split('.')[-2]}"
self.flag_get_global_info=True
self._min=None
self._max=None
def get_global_info(self,label):
label_dim=label.shape[0]
if self._max is None:
self._min=np.zeros((label_dim))+999999999
self._max=np.zeros((label_dim))
for i in range(label_dim):
# tmp_max=np.max(label)
# tmp_min=np.min(label)
if label[i]>self._max[i]:
self._max[i]=label[i]
if label[i]<self._min[i]:
self._min[i]=label[i]
def run(self,label):
for i in range(label.shape[0]):
label[i]=(label[i]-self._min[i])/(self._max[i]-self._min[i])
return label
def reverse(self,labels):
for i in range(labels.shape[1]):
labels[:,i]=labels[:,i]*(self._max[i]-self._min[i])+self._min[i]
return labels
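A small sketch (again an assumption, with invented numbers) of the `reverse` direction, which `get_metric()` in `pre_process.py` below uses to map network outputs in [0, 1] back to physical units before computing errors and hit rates:
```
import numpy as np
from data.labels_scaling.max_min import LabelScaling

scaler = LabelScaling()
# Accumulate per-label min/max from two invented label vectors (TSC_T, TSC_C, TSC_C_lab, TSC_P_lab).
for label in [np.array([1600.0, 0.45, 0.44, 0.020]), np.array([1650.0, 0.60, 0.61, 0.030])]:
    scaler.get_global_info(label)
outputs = np.array([[0.5, 0.5, 0.5, 0.5]])  # a batch of scaled network outputs, shape (batch, 4)
print(scaler.reverse(outputs))              # back in physical units, e.g. 1625.0 for TSC_T
```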

373
src/data/pre_process.py Normal file
View File

@ -0,0 +1,373 @@
'''
@File : pre_process.py
@Time : 2024/08/09 13:54:28
@Author : Zhanpeng Yang
@Version : 0.0.1
@Contact : zhpyang@outlook.com
@Desc : Data preprocessing
'''
import pandas as pd
import logging
import os,glob
import numpy as np
import datetime
import pickle
import h5py
class DataPreProcess:
"""
Contains all the data preprocessing steps, including:
1. Extract the spectra around the target times from the raw binary data, using the Excel files provided by the steel plant
2. From those spectra, select stable time and spatial points as the features
3. Scale the features and labels
"""
def __init__(self, raw_spectral_data_dir, raw_labels_dir,labels_name,dataset_dir,choose_frame_spatial, features_scaling,labels_scaling,train_ratio=0.8,validate_ratio=0.1) -> None:
"""初始化类,传入所有数据预处理需要的参数
Args:
raw_spectral_data_dir (_type_):原始光谱数据路径
raw_labels_dir (_type_): 原始标签路径
labels_name (_type_): 提取出的标签名
dataset_dir (_type_): 生成的数据集文件夹
choose_frame_spatial (_type_): 类对象进行光谱特征挑选
features_scaling (_type_): 类对象对输入的特征进行放缩使之直接输入神经网络
labels_scaling (_type_): 类对象对输出的的标签进行放缩
train_ratio (float, optional): 训练集比例. Defaults to 0.8.
validate_ratio (float, optional): 验证集比例剩余的即为测试集. Defaults to 0.1.
"""
self.raw_spectral_data_dir=raw_spectral_data_dir
self.raw_labels_dir=raw_labels_dir
self.dataset_dir=dataset_dir
self.labels_name=labels_name
self.choose_frame_spatial=choose_frame_spatial
self.features_scaling=features_scaling
self.labels_scaling=labels_scaling
self.train_ratio=train_ratio
self.validate_ratio=validate_ratio
#load the raw labels
self.raw_labels=self._load_raw_labels(raw_labels_dir,labels_name)
#load the cache of the raw spectra
self.raw_spectral_data_cache=self._load_raw_spectral_data_cache(raw_spectral_data_dir)
#load an arbitrary csv file to determine the spectral data dimensions
self.spectral_dim,self.spatial_dim= self._get_spectral_spatial_dim(self.raw_spectral_data_cache)
#start converting the raw data into the dataset
self._raw_data_2_dataset()
def _load_raw_labels(self,raw_labels_dir:str,labels_name:list)->pd.DataFrame:
"""读取利用脚本处理后的钢厂给的excel文件所在文件夹会扫描所有文件
并选择指定的label名作为output
并去除值为NaN或0的行
Args:
raw_labels_dir (str): 利用脚本处理后的钢厂给的excel路径
labels_name (list): 指定的作为Label的列
Returns:
pd.DataFrame: 返回所有筛选后的炉次数据
"""
raw_labels=None
for name in os.listdir(raw_labels_dir):
tmp_raw_labels=pd.read_excel(os.path.join(raw_labels_dir,name))
choosed_column=["TSC_start_time","TSC_end_time"]
choosed_column=choosed_column+labels_name
#keep only the columns we want as labels
tmp_raw_labels=tmp_raw_labels.loc[:,choosed_column]
# rows containing NULL
null_rows=tmp_raw_labels.isnull().any(axis=1)
# # rows containing 0
zeros_rows=(tmp_raw_labels==0).any(axis=1) | (tmp_raw_labels=='0').any(axis=1)
# # drop every row that has a NULL or a 0
selected_rows=~(null_rows|zeros_rows)
tmp_raw_labels=tmp_raw_labels[selected_rows]
if raw_labels is None:
raw_labels=tmp_raw_labels
else:
raw_labels=pd.concat([raw_labels,tmp_raw_labels],axis=0)
logging.debug(f"Reading raw label excel file:{name}, which has {tmp_raw_labels.shape[0]} furnaces")
logging.debug(f"Readed raw label excel files, which has {raw_labels.shape[0]} furnaces in total")
return raw_labels
def _load_raw_spectral_data_cache(self,raw_spectral_data_dir:str)->list:
"""生成所有原始光谱数据文件的缓存,包括每个文件记录的开始及结束时间,目的为加快后面读取原始数据的速度
Args:
raw_spectral_data_dir (str): 原始光谱所在路径
Returns:
list: 缓存其中有多少个成员就有多少个原始数据文件每个成员包括格式为datetime的开始及结束时间及文件路径
"""
spectral_file_paths=glob.glob(os.path.join(raw_spectral_data_dir,"*.csv"))
cache_file_paths=glob.glob(os.path.join(raw_spectral_data_dir,"*.pkl"))
update_flag=False
#Regenerate if no cache file exists, or if the number of data files recorded in the cache does not match the number of files in the folder
if len(cache_file_paths)==0:
logging.debug(f"Raw spectral data cache is not existed! Generating")
update_flag=True
elif len(cache_file_paths)==1:
with open(cache_file_paths[0],"rb") as f:
raw_spectral_data_cache=pickle.load(f)
if len(raw_spectral_data_cache) !=len(spectral_file_paths):
logging.debug(f"Raw spectral data cache is out of date! Regenerating, cache file number:{len(raw_spectral_data_cache)}, spectral data file number: {len(spectral_file_paths)}")
update_flag=True
else:
logging.error(f"More the one 'raw_spectral_data_cache.pkl' file is existed in {raw_spectral_data_dir}")
if update_flag:
spectral_file_paths.sort()
raw_spectral_data_cache=[]
for file in spectral_file_paths:
tmp_info={}
tmp_data=np.loadtxt(file, delimiter=",")
start_t=datetime.datetime.fromtimestamp(tmp_data[0,0]/1000)+datetime.timedelta(microseconds=tmp_data[0,0]%1000)
end_t=datetime.datetime.fromtimestamp(tmp_data[-1,0]/1000)+datetime.timedelta(microseconds=tmp_data[-1,0]%1000)
tmp_info["start_t"]=start_t
tmp_info["end_t"]=end_t
tmp_info["file_path"]=file
raw_spectral_data_cache.append(tmp_info)
with open(os.path.join(raw_spectral_data_dir,f"raw_spectral_data_cache.pkl"),"wb") as f:
pickle.dump(raw_spectral_data_cache,f)
return raw_spectral_data_cache
def _get_spectral_spatial_dim(self,raw_spectral_data_cache):
data=np.loadtxt(raw_spectral_data_cache[0]["file_path"], delimiter=",").astype(np.uint64)
if data[0,2]==229376:
spectral_dim =224
spatial_dim=512
if data[0,2]==917504:
spectral_dim =448
spatial_dim=1024
return spectral_dim, spatial_dim
def _read_spectral_data(self,start_time:datetime.datetime,end_time:datetime.datetime)->np.ndarray:
"""获取从start_time到end_time的光谱数据
Args:
start_time (datetime.datetime): 开始时间
end_time (datetime.datetime): 结束时间
Returns:
np.ndarray: 原始光谱数据
"""
def get_spectral_data_per_file(file_path,s_t,e_t):
data=np.loadtxt(file_path, delimiter=",").astype(np.uint64)
if s_t is not None:
tmp_s=datetime.datetime.timestamp(s_t)*1000
tmp_s_index=0
for i in range(data.shape[0]-1):
if data[i,0]<=tmp_s and data[i+1,0]>=tmp_s:
tmp_s_index=i
break
else:
tmp_s_index=0
if e_t is not None:
tmp_e=datetime.datetime.timestamp(e_t)*1000
tmp_e_index=data.shape[0]
for i in range(tmp_s_index,data.shape[0]-1):
if data[i,0]<=tmp_e and data[i+1,0]>=tmp_e:
tmp_e_index=i
break
else:
tmp_e_index=data.shape[0]
with open(file_path[:-3]+"bin", "rb") as f:
f.seek(data[tmp_s_index,1])
d=f.read(np.uint64((tmp_e_index-tmp_s_index)*data[tmp_s_index,2]))
d=np.frombuffer(d, dtype=np.uint16).reshape(tmp_e_index-tmp_s_index,self.spectral_dim,self.spatial_dim)
return data[tmp_s_index:tmp_e_index,0],d
timestamps=None
raw_spectral_data=None
for tmp_info in self.raw_spectral_data_cache:
tmp_data=None
if start_time<tmp_info["start_t"] and end_time>tmp_info["start_t"] and end_time<tmp_info["end_t"]:
# The target interval overlaps the left side of this file's interval, so take data from the start of the file up to end_time
tmp_time_stamp,tmp_data=get_spectral_data_per_file(tmp_info["file_path"] ,None,end_time)
elif start_time<tmp_info["start_t"] and end_time>tmp_info["end_t"]:
# The target interval fully contains this file's interval, so take the whole file
tmp_time_stamp,tmp_data=get_spectral_data_per_file(tmp_info["file_path"] ,None,None)
elif start_time>tmp_info["start_t"] and end_time<tmp_info["end_t"]:
# The target interval is fully contained in this file's interval, so take data from start_time to end_time
tmp_time_stamp,tmp_data=get_spectral_data_per_file(tmp_info["file_path"] ,start_time,end_time)
elif start_time>tmp_info["start_t"] and start_time<tmp_info["end_t"] and end_time>tmp_info["end_t"]:
# The target interval overlaps the right side of this file's interval, so take data from start_time to the end of the file
tmp_time_stamp,tmp_data=get_spectral_data_per_file(tmp_info["file_path"] ,start_time,None)
if tmp_data is not None:
if raw_spectral_data is None:
timestamps=tmp_time_stamp
raw_spectral_data=tmp_data
else:
timestamps=np.concatenate((timestamps,tmp_time_stamp),axis=0)
raw_spectral_data=np.concatenate((raw_spectral_data,tmp_data),axis=0)
return timestamps,raw_spectral_data
def _raw_data_2_dataset(self):
save_dir=os.path.join(self.dataset_dir,self.choose_frame_spatial.description)
#Step 1: select the feature time and spatial points, and save the result
pre_dataset_save_dir=os.path.join(save_dir,"data")
if not os.path.exists(pre_dataset_save_dir):
os.makedirs(pre_dataset_save_dir)
#If the data directory does not exist it is regenerated; if it exists, this part is skipped.
# Note: when new data arrives, delete the folders under dataset (i.e. clear this cache) so everything is regenerated
for i in range(self.raw_labels.shape[0]):
start_time,end_time=self.choose_frame_spatial.get_data_interval(self.raw_labels.iloc[i]["TSC_start_time"],self.raw_labels.iloc[i]["TSC_end_time"])
timestamps,raw_spectral_data=self._read_spectral_data(start_time,end_time)
if raw_spectral_data is not None:
#only record if the retrieved data averages more than 2 frames per second
if raw_spectral_data.shape[0]>2*(end_time-start_time).total_seconds():
logging.debug(f"PreProcess Stage 1: [{i+1}/{self.raw_labels.shape[0]}] with {timestamps.shape[0]} frames")
raw_spectral_data=self.choose_frame_spatial.run(timestamps,raw_spectral_data)
np.savez(os.path.join(pre_dataset_save_dir,f"{timestamps[0]}_{timestamps.shape[0]}.npz",),timestamps=timestamps,raw_spectral_data=raw_spectral_data,raw_labels=self.raw_labels.iloc[i][self.labels_name].to_numpy())
else:
logging.info(f"Pre Dataset is existed in {pre_dataset_save_dir}")
#Step 2: apply scaling and assemble the dataset
self.dataset_save_dir=os.path.join(save_dir,f"{self.features_scaling.description}_{self.labels_scaling.description}")
if not os.path.exists(self.dataset_save_dir):
os.makedirs(self.dataset_save_dir)
pre_dataset_files=os.listdir(pre_dataset_save_dir)
if self.features_scaling.flag_get_global_info:
for name in pre_dataset_files:
tmp_data=np.load(os.path.join(pre_dataset_save_dir,name),allow_pickle=True)
self.features_scaling.get_global_info(tmp_data["raw_spectral_data"])
if self.labels_scaling.flag_get_global_info:
self.labels_scaling.get_global_info(tmp_data["raw_labels"])
# get the dimension info
tmp_data=np.load(os.path.join(pre_dataset_save_dir,name),allow_pickle=True)
feature_dim=tmp_data["raw_spectral_data"].shape
label_dim=tmp_data["raw_labels"].shape
#split into training / validation / test sets
np.random.shuffle(pre_dataset_files)
[train_dateset_files,validate_dateset_files]=np.split(pre_dataset_files,[int(len(pre_dataset_files)*self.train_ratio)])
[validate_dateset_files,test_dateset_files]=np.split(validate_dateset_files,[int(len(pre_dataset_files)*self.validate_ratio)])
#write the HDF5 files
for dataset_name in ["train","validate","test"]:
if dataset_name=="train":
file_names=train_dateset_files
elif dataset_name=="validate":
file_names=validate_dateset_files
elif dataset_name=="test":
file_names=test_dateset_files
logging.info(f"Generating {dataset_name} dataset with {len(file_names)} samples")
with h5py.File(os.path.join( self.dataset_save_dir,f"{dataset_name}.h5"), 'w') as f:
h5_features = f.create_dataset(
'features',
tuple([len(file_names)]+list(feature_dim)),
maxshape=(tuple([None]+list(feature_dim))),
chunks=tuple([1]+list(feature_dim)), # manually set the chunk size to one sample: tuple([1]+list(feature_dim))
# compression='gzip',
# compression_opts=9,
dtype=np.float32,
)
h5_labels = f.create_dataset(
'labels',
tuple([len(file_names)]+list(label_dim)),
chunks=tuple([1]+list(label_dim)), # manually set the chunk size to one sample
dtype=np.float32,
)
for i,name in enumerate(file_names):
tmp_data=np.load(os.path.join(pre_dataset_save_dir,name),allow_pickle=True)
feature=self.features_scaling.run(tmp_data["raw_spectral_data"])
label=self.labels_scaling.run(tmp_data["raw_labels"])
h5_features[i]=feature.astype(np.float32)
h5_labels[i]=label.astype(np.float32)
else:
pre_dataset_files=os.listdir(pre_dataset_save_dir)
if self.labels_scaling.flag_get_global_info:
for name in pre_dataset_files:
tmp_data=np.load(os.path.join(pre_dataset_save_dir,name),allow_pickle=True)
self.labels_scaling.get_global_info(tmp_data["raw_labels"])
logging.info(f"Standardized Dataset is existed in {self.dataset_save_dir}")
def get_metric(self,outputs,labels):
outputs=self.labels_scaling.reverse(outputs)
labels=self.labels_scaling.reverse(labels)
error=outputs-labels
# print("outputs",outputs)
# print("labels",labels)
# print("errors",error)
hit_rate=np.zeros(error.shape[1])
for i,name in enumerate(self.labels_name):
# error[:,i]=outputs[:,i]-labels[:,i]
#["TSC_T","TSC_C","TSC_C_lab","TSC_P_lab"]
if name =="TSC_T":
bound=10
elif name=="TSC_C":
bound=0.05
elif name=="TSC_C_lab":
bound=0.05
elif name=="TSC_P_lab":
bound=0.0005
hit_rate[i]=((error[:,i]>=-bound) &(error[:,i]<=bound)).sum() /error.shape[0]
return error,hit_rate
if __name__=="__main__":
logging.basicConfig(level = logging.DEBUG)
raw_data_dir="/code/admin/20240806-NanEr-5-8-data/rawdata"
labels_path="/code/admin/20240806-NanEr-5-8-data/labels/NanEr"
dataset_dir="/code/admin/20240806-NanEr-5-8-data/dataset"
labels_name=["TSC_T","TSC_C","TSC_C_lab","TSC_P_lab"]
from choose_frame_spatial.mean import ChooseFrameSpatial
from features_scaling.max_min import FeatureScaling
from labels_scaling.max_min import LabelScaling
choose_frame_spatial=ChooseFrameSpatial(interval=[-30,30])
features_scaling=FeatureScaling()
labels_scaling=LabelScaling()
data_pre_process=DataPreProcess(raw_data_dir,labels_path,labels_name,dataset_dir,choose_frame_spatial,features_scaling,labels_scaling)
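To make the hit-rate definition in `get_metric` above concrete, a toy check with invented numbers: a prediction counts as a hit when its signed error stays within the per-label bound (±10 for TSC_T, ±0.05 for TSC_C and TSC_C_lab, ±0.0005 for TSC_P_lab).
```
import numpy as np

errors = np.array([            # columns: TSC_T, TSC_C, TSC_C_lab, TSC_P_lab (invented)
    [  8.0,  0.02,  0.06,  0.0004],
    [-12.0, -0.01,  0.03, -0.0007],
])
bounds = np.array([10.0, 0.05, 0.05, 0.0005])
hit_rate = ((errors >= -bounds) & (errors <= bounds)).sum(axis=0) / errors.shape[0]
print(hit_rate)  # [0.5 1.  0.5 0.5]
```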

View File

@ -0,0 +1,50 @@
import torch
import h5py
import os
import time
def load_dataset(dataset_dir):
train_dataset=SpectralDataset(os.path.join(dataset_dir,"train.h5"))
val_dataset=SpectralDataset(os.path.join(dataset_dir,"validate.h5"))
return train_dataset, val_dataset
class SpectralDataset(torch.utils.data.Dataset):
def __init__(self, hdf5_path):
self.hdf5_path=hdf5_path
self.hdf5_f = h5py.File(hdf5_path, 'r',
# rdcc_nbytes=1024**2*1024*10, # total chunk-cache size in bytes (10 GB here); should be an integer multiple of the chunk size
# rdcc_w0=0, # cache eviction policy: closer to 0 evicts least-recently-used chunks first, closer to 1 evicts fully read/written chunks first
# rdcc_nslots=1e8, # number of slots for locating cached chunks; recommended 10x the number of cached chunks, 100x for best performance. Default 521
)
self.features=self.hdf5_f["features"]
self.labels=self.hdf5_f["labels"]
def __len__(self):
return self.features.shape[0]
def __getitem__(self, idx):
#build the input
feature=self.features[idx]
label=self.labels[idx]
return feature, label
if __name__ =="__main__":
training_data=SpectralDataset("/home/admin/20240806-NanEr-5-8-data/dataset/mean_-30_30/max_min_max_min/train.h5")
print("数据集大小为:",len(training_data))
print("输入Feature维度为", training_data[0][0].shape, "输出Label维度为",training_data[0][1].shape)
from torch.utils.data import DataLoader
train_dataloader = DataLoader(training_data, batch_size=64, shuffle=True)
train_features, train_labels = next(iter(train_dataloader))
print(f"Feature batch shape: {train_features.size()}")
print(f"Labels batch shape: {train_labels.size()}")

View File

@ -0,0 +1,78 @@
import torch.nn as nn
import torch.nn.functional as F
import torchinfo
import torch
class CNN1D_FCNN(nn.Module):
def __init__(self,):
super(CNN1D_FCNN, self).__init__()
self.Conv2d_1=nn.Conv1d(in_channels=1, out_channels=256, kernel_size=3,stride=1)
self.Conv2d_2=nn.Conv1d(in_channels=256, out_channels=512, kernel_size=3,stride=1)
self.MaxPool1d_1=nn.MaxPool1d(kernel_size=2, stride=2)
self.Conv2d_3=nn.Conv1d(in_channels=512, out_channels=256, kernel_size=3,stride=1)
self.Conv2d_4=nn.Conv1d(in_channels=256, out_channels=32, kernel_size=3,stride=1)
self.Conv2d_5=nn.Conv1d(in_channels=32, out_channels=3, kernel_size=2,stride=1)
self.flatten=nn.Flatten(start_dim=1, end_dim=2)
# self.fc_1=nn.Linear(in_features=645,out_features=1024) #2048->1024
# self.fc_2=nn.Linear(in_features=1024,out_features=512)
self.fc_3=nn.Linear(in_features=315,out_features=256)
self.fc_4=nn.Linear(in_features=256,out_features=128)
self.fc_5=nn.Linear(in_features=128,out_features=4)
# self.reshape_3=nn.Unflatten(dim=0,unflattened_size=(batch_size,601))
def forward(self, x):
# input_shape=x.shape
x=torch.mean(x,dim=1,keepdim=True)
x=torch.mean(x,dim=3)
print(x.shape)
# x=torch.unsqueeze(x, 1)
x=F.tanh(self.Conv2d_1(x))
x=F.tanh(self.Conv2d_2(x))
x=self.MaxPool1d_1(x)
x=F.tanh(self.Conv2d_3(x))
x=F.tanh(self.Conv2d_4(x))
x=F.tanh(self.Conv2d_5(x))
x=self.flatten(x)
# x=F.tanh(self.fc_1(x))
# x=F.tanh(self.fc_2(x))
x=F.tanh(self.fc_3(x))
x=F.tanh(self.fc_4(x))
x=F.sigmoid(self.fc_5(x))
#
# x=nn.Unflatten(dim=0,unflattened_size=(int(input_shape[0]),int(input_shape[1])))(x)
# x=nn.Flatten(start_dim=2, end_dim=4)(x)
# x,_=self.lstm_1(x)
# x=x[:,-1,:]
# x=nn.LeakyReLU()(self.fc_1(x))
# x=nn.LeakyReLU()(self.fc_2(x))
# x=nn.Sigmoid()(self.fc_3(x))
return x
if __name__=="__main__":
model=CNN1D_FCNN()
torchinfo.summary(model,input_size=(64, 48, 224, 10))

Binary file not shown.

Binary file not shown.

Binary file not shown.

1
src/inference/main.py Normal file
View File

@ -0,0 +1 @@
np

Binary file not shown.

12
src/inference/test.http Normal file
View File

@ -0,0 +1,12 @@
GET http://localhost:8000
###
POST http://localhost:8000/inference HTTP/1.1
Content-Type: application/json
{
"type": "a",
"a":"c"
}

17
src/inference/test.py Normal file
View File

@ -0,0 +1,17 @@
from fastapi import FastAPI
from pydantic import BaseModel
app = FastAPI()
@app.get("/")
async def root():
return {"message": "Hello World"}
class Item(BaseModel):
type: str
@app.post("/inference/")
async def inference(data:Item):
print(data)
return {"message": "Hello World"}

56
src/main_train.py Normal file
View File

@ -0,0 +1,56 @@
import hydra
import omegaconf # DictConfig, OmegaConf
import logging
import ray,os
import ray.tune
from functools import partial
logger = logging.getLogger(__name__)
from optimizer.trainable import trainable
#load the system configuration file
@hydra.main(version_base=None, config_path="../configs", config_name="main_train.yaml")
def main(hydra_cfg : omegaconf.DictConfig) -> None:
logger.info(f"**************Hydra Configs**************\n{omegaconf.OmegaConf.to_yaml(hydra_cfg)}")
#get the output directory of the current run
hydra_output_dir = hydra.core.hydra_config.HydraConfig.get()
hydra_output_dir=hydra_output_dir['runtime']['output_dir']
#build the current hyperparameter search space
tune_cfg={}
for key in hydra_cfg.ray_tune.config.keys():
tune_cfg[key]=hydra.utils.instantiate(hydra_cfg.ray_tune.config[key],_recursive_=False)
#trial scheduler
scheduler=hydra.utils.instantiate(hydra_cfg.ray_tune.scheduler)
result=ray.tune.run(
partial(trainable,hydra_cfg),
resources_per_trial=omegaconf.OmegaConf.to_container(hydra_cfg.ray_tune.run.resources_per_trial),
config=tune_cfg,
num_samples=hydra_cfg.ray_tune.run.num_samples,
scheduler=scheduler,
storage_path=hydra_output_dir,
log_to_file=os.path.join(hydra_output_dir,"ray-tune.log"),
name="ray-tune",
verbose=1
)
#select and report the best trial
best_trial = result.get_best_trial("val_loss", "min", "last")
print(f"Best trial config: {best_trial.config}")
print(f"Best trial final validation loss: {best_trial.last_result['val_loss']}")
if __name__=="__main__":
main()

78
src/model/CNN1D_FCNN.py Normal file
View File

@ -0,0 +1,78 @@
import torch.nn as nn
import torch.nn.functional as F
import torchinfo
import torch
class CNN1D_FCNN(nn.Module):
def __init__(self,):
super(CNN1D_FCNN, self).__init__()
self.Conv2d_1=nn.Conv1d(in_channels=1, out_channels=1024, kernel_size=3,stride=1)
self.Conv2d_2=nn.Conv1d(in_channels=1024, out_channels=2048, kernel_size=3,stride=1)
self.MaxPool1d_1=nn.MaxPool1d(kernel_size=2, stride=2)
self.Conv2d_3=nn.Conv1d(in_channels=2048, out_channels=1024, kernel_size=3,stride=1)
self.Conv2d_4=nn.Conv1d(in_channels=1024, out_channels=256, kernel_size=3,stride=1)
self.Conv2d_5=nn.Conv1d(in_channels=256, out_channels=3, kernel_size=2,stride=1)
self.flatten=nn.Flatten(start_dim=1, end_dim=2)
# self.fc_1=nn.Linear(in_features=645,out_features=1024) #2048->1024
# self.fc_2=nn.Linear(in_features=1024,out_features=512)
self.fc_3=nn.Linear(in_features=315,out_features=256)
self.fc_4=nn.Linear(in_features=256,out_features=128)
self.fc_5=nn.Linear(in_features=128,out_features=4)
# self.reshape_3=nn.Unflatten(dim=0,unflattened_size=(batch_size,601))
def forward(self, x):
# input_shape=x.shape
# x=torch.mean(x,dim=1,keepdim=True)
# x=torch.mean(x,dim=3)
# print(x.shape)
x=torch.unsqueeze(x, 1)
x=F.tanh(self.Conv2d_1(x))
x=F.tanh(self.Conv2d_2(x))
x=self.MaxPool1d_1(x)
x=F.tanh(self.Conv2d_3(x))
x=F.tanh(self.Conv2d_4(x))
x=F.tanh(self.Conv2d_5(x))
x=self.flatten(x)
# x=F.tanh(self.fc_1(x))
# x=F.tanh(self.fc_2(x))
x=F.tanh(self.fc_3(x))
x=F.tanh(self.fc_4(x))
x=F.sigmoid(self.fc_5(x))
#
# x=nn.Unflatten(dim=0,unflattened_size=(int(input_shape[0]),int(input_shape[1])))(x)
# x=nn.Flatten(start_dim=2, end_dim=4)(x)
# x,_=self.lstm_1(x)
# x=x[:,-1,:]
# x=nn.LeakyReLU()(self.fc_1(x))
# x=nn.LeakyReLU()(self.fc_2(x))
# x=nn.Sigmoid()(self.fc_3(x))
return x
if __name__=="__main__":
model=CNN1D_FCNN()
torchinfo.summary(model,input_size=(64, 224))

76
src/model/CNN_FCNN.py Normal file
View File

@ -0,0 +1,76 @@
import torch.nn as nn
import torch.nn.functional as F
import torchinfo
import torch
class CNN_FCNN(nn.Module):
def __init__(self,):
super(CNN_FCNN, self).__init__()
self.Conv2d_1=nn.Conv2d(in_channels=1, out_channels=512, kernel_size=3,stride=1)
self.Conv2d_2=nn.Conv2d(in_channels=512, out_channels=1024, kernel_size=3,stride=1)
# self.MaxPool2d_1=nn.MaxPool2d(kernel_size=2, stride=2)
self.Conv2d_3=nn.Conv2d(in_channels=1024, out_channels=512, kernel_size=3,stride=1)
self.Conv2d_4=nn.Conv2d(in_channels=512, out_channels=64, kernel_size=3,stride=1)
self.Conv2d_5=nn.Conv2d(in_channels=64, out_channels=6, kernel_size=2,stride=1)
self.flatten=nn.Flatten(start_dim=1, end_dim=3)
self.fc_1=nn.Linear(in_features=1290,out_features=2048) #2048->1024
self.fc_2=nn.Linear(in_features=2048,out_features=1024)
self.fc_3=nn.Linear(in_features=1024,out_features=512)
self.fc_4=nn.Linear(in_features=512,out_features=256)
self.fc_5=nn.Linear(in_features=256,out_features=4)
# self.reshape_3=nn.Unflatten(dim=0,unflattened_size=(batch_size,601))
def forward(self, x):
# input_shape=x.shape
x=torch.mean(x,dim=1,keepdim=True)
# x=torch.unsqueeze(x, 1)
x=F.tanh(self.Conv2d_1(x))
x=F.tanh(self.Conv2d_2(x))
# x=self.MaxPool2d_1(x)
x=F.tanh(self.Conv2d_3(x))
x=F.tanh(self.Conv2d_4(x))
x=F.tanh(self.Conv2d_5(x))
x=self.flatten(x)
x=F.tanh(self.fc_1(x))
x=F.tanh(self.fc_2(x))
x=F.tanh(self.fc_3(x))
x=F.tanh(self.fc_4(x))
x=F.sigmoid(self.fc_5(x))
#
# x=nn.Unflatten(dim=0,unflattened_size=(int(input_shape[0]),int(input_shape[1])))(x)
# x=nn.Flatten(start_dim=2, end_dim=4)(x)
# x,_=self.lstm_1(x)
# x=x[:,-1,:]
# x=nn.LeakyReLU()(self.fc_1(x))
# x=nn.LeakyReLU()(self.fc_2(x))
# x=nn.Sigmoid()(self.fc_3(x))
return x
if __name__=="__main__":
model=CNN_FCNN()
torchinfo.summary(model,input_size=(64, 48, 224, 10))

View File

@ -0,0 +1,69 @@
import torch.nn as nn
import torch.nn.functional as F
import torchinfo
import torch
class CNN_LSTM_FCNN(nn.Module):
def __init__(self,):
super(CNN_LSTM_FCNN, self).__init__()
# self.reshape_1=nn.Flatten(start_dim=0, end_dim=1)
self.Conv2d_1=nn.Conv2d(in_channels=1, out_channels=256, kernel_size=3,stride=1)
self.Conv2d_2=nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3,stride=1)
self.Conv2d_3=nn.Conv2d(in_channels=512, out_channels=256, kernel_size=3,stride=1)
self.Conv2d_4=nn.Conv2d(in_channels=256, out_channels=32, kernel_size=3,stride=1)
self.MaxPool2d_1=nn.MaxPool2d(kernel_size=2, stride=2)
self.lstm_1=nn.LSTM(input_size=3456, hidden_size=2048,num_layers=2, batch_first=True)
self.fc_1=nn.Linear(in_features=2048,out_features=1024) #2048->1024
self.fc_2=nn.Linear(in_features=1024,out_features=256)
self.fc_3=nn.Linear(in_features=256,out_features=4)
# self.reshape_3=nn.Unflatten(dim=0,unflattened_size=(batch_size,601))
def forward(self, x):
input_shape=x.shape
x=nn.Flatten(start_dim=0, end_dim=1)(x)
x=torch.unsqueeze(x, 1)
x=self.Conv2d_1(x)
x=self.Conv2d_2(x)
x=self.Conv2d_3(x)
x=self.Conv2d_4(x)
x=self.MaxPool2d_1(x)
x=nn.Unflatten(dim=0,unflattened_size=(int(input_shape[0]),int(input_shape[1])))(x)
x=nn.Flatten(start_dim=2, end_dim=4)(x)
x,_=self.lstm_1(x)
x=x[:,-1,:]
x=nn.LeakyReLU()(self.fc_1(x))
x=nn.LeakyReLU()(self.fc_2(x))
x=nn.Sigmoid()(self.fc_3(x))
return x
if __name__=="__main__":
model=CNN_LSTM_FCNN()
torchinfo.summary(model,input_size=(64, 48, 224, 10))

42
src/model/FCNN.py Normal file
View File

@ -0,0 +1,42 @@
import torch.nn as nn
import torch.nn.functional as F
import torchinfo
class FCNN(nn.Module):
def __init__(self,):
super(FCNN, self).__init__()
self.fc1 = nn.Linear(48* 224* 10, 4)
# self.fc2 = nn.Linear(40960, 10240)
# self.fc3 = nn.Linear(10240, 4096)
# self.fc4 = nn.Linear(4096, 2048)
# self.fc5 = nn.Linear(2048, 1024)
# self.fc6 = nn.Linear(1024, 512)
# self.fc7 = nn.Linear(512, 256)
# self.fc8 = nn.Linear(256, 128)
self.fc9 = nn.Linear(128, 4)
# self.fc4 = nn.Linear(10240, 4)
def forward(self, x):
x=x.reshape(x.shape[0],-1)
x = F.relu(self.fc1(x))
# x = F.relu(self.fc2(x))
# x = F.relu(self.fc3(x))
# x = F.relu(self.fc4(x))
# x = F.relu(self.fc5(x))
# x = F.relu(self.fc6(x))
# x = F.relu(self.fc7(x))
# x = F.relu(self.fc8(x))
# x = F.sigmoid(self.fc9(x))
# x = F.relu(self.fc2(x))
# x = F.relu(self.fc3(x))
# x = F.relu(self.fc4(x))
return x
if __name__=="__main__":
model=FCNN()
torchinfo.summary(model,input_size=(64, 48, 224, 10))

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

246
src/optimizer/trainable.py Normal file
View File

@ -0,0 +1,246 @@
import ray.train
import ray.tune
import ray.tune.logger
from data.pre_process import DataPreProcess
from data.spectral_dataset import load_dataset
import ray
import hydra
import random
import logging
import torch
import pickle
import pathlib
import tempfile
import time,os
from torch.utils.tensorboard import SummaryWriter
import numpy as np
import matplotlib.pyplot as plt
logger = logging.getLogger("ray")
def trainable(hydra_cfg,tune_cfg):
"""
tune_cfg: the tunable configuration passed in by Ray Tune
hydra_cfg: the full configuration passed in by Hydra
"""
#set up the TensorBoard writer
writer=SummaryWriter(log_dir=os.getcwd().replace("working_dirs","driver_artifacts"))
############################
## Preprocess the raw data into a dataset ##
############################
t_1=time.time()
data_preprocess=DataPreProcess( hydra_cfg.raw_spectral_data_dir,hydra_cfg.raw_labels_dir,hydra_cfg.labels_name,hydra_cfg.dataset_dir,hydra.utils.instantiate(tune_cfg["choose_frame_spatial"]), hydra.utils.instantiate(tune_cfg["features_scaling"]),hydra.utils.instantiate(tune_cfg["labels_scaling"]),hydra_cfg.dataset.train_ratio,hydra_cfg.dataset.validate_ratio)
t_2=time.time()
logger.info(f"Preprocessed raw data costs {t_2-t_1}s")
############################
## Load the dataset into dataloaders ##
############################
train_dataset,val_dataset=load_dataset(data_preprocess.dataset_save_dir)
trainloader = torch.utils.data.DataLoader(
train_dataset, batch_size=int(tune_cfg["batch_size"]), shuffle=True, num_workers=hydra_cfg.dataset.num_worker
)
valloader = torch.utils.data.DataLoader(
val_dataset, batch_size=int(tune_cfg["batch_size"]), shuffle=True, num_workers=hydra_cfg.dataset.num_worker
)
t_3=time.time()
logger.info(f"Dataloader costs {t_3-t_2}s")
# select the training device
device = torch.device("cpu")
if torch.cuda.is_available():
device = torch.device("cuda:0")
################################
## Initialize model, optimizer and loss function ##
################################
model=hydra.utils.instantiate(tune_cfg["model"])
model.to(device)
criterion=hydra.utils.instantiate(tune_cfg["criterion"])
optimizer=hydra.utils.instantiate(tune_cfg["optimizer"])
optimizer=optimizer(model.parameters())
t_4=time.time()
logger.info(f"Initilized model costs {t_4-t_3}s")
#BUG,暂不可用
################################
## Load a checkpoint from an unfinished run ##
################################
checkpoint = ray.train.get_checkpoint()
if checkpoint:
with checkpoint.as_directory() as checkpoint_dir:
data_path = pathlib.Path() / "data.pkl"
with open(data_path, "rb") as fp:
checkpoint_state = pickle.load(fp)
start_epoch = checkpoint_state["epoch"]
model.load_state_dict(checkpoint_state["net_state_dict"])
optimizer.load_state_dict(checkpoint_state["optimizer_state_dict"])
else:
start_epoch = 0
t_5=time.time()
logger.info(f"Check checkpoint costs {t_5-t_4}s")
################################
## Write the model graph to TensorBoard ##
################################
sample = train_dataset[0:4][0]
print(sample.shape)
writer.add_graph(model, torch.tensor(sample).to(device))
for epoch in range(start_epoch, hydra_cfg.train.max_epoch):
################################
## Model training ##
################################
logger.info(f"Start epoch {epoch+1}")
train_loss = 0.0
train_steps = 0
train_hit_rate=None
t_epoch_start=time.time()
train_errors=None
train_outputs=None
train_labels=None
model.train()
for batch, (features, labels) in enumerate(trainloader):
t_iteration_start=time.time()
features=features.to(device)
labels=labels.to(device)
# zero the parameter gradients
optimizer.zero_grad()
# Compute prediction and loss
outputs = model(features)
loss = criterion(outputs, labels)
labels_npy=labels.cpu().detach().numpy()
outputs_npy=outputs.cpu().detach().numpy()
# compute the custom metrics
error,hit_rate=data_preprocess.get_metric(outputs_npy, labels_npy)
# Backpropagation
loss.backward()
optimizer.step()
# accumulate our custom metrics
train_loss += loss.item()
train_steps += 1
if train_steps==1:
train_hit_rate=hit_rate
train_errors=error
train_outputs=outputs_npy
train_labels=labels_npy
else:
train_hit_rate=(train_steps-1)/train_steps *train_hit_rate+ 1/train_steps*hit_rate
train_errors=np.concatenate((train_errors,error),axis=0)
train_outputs=np.concatenate((train_outputs,outputs_npy),axis=0)
train_labels=np.concatenate((train_labels,labels_npy),axis=0)
t_iteration_end=time.time()
print("Training Epoch:[{}/{}],Iteration:{}/{},Loss:{}, Cost {}s".format(epoch+1,hydra_cfg.train.max_epoch,train_steps,len(trainloader),loss.item(),t_iteration_end-t_iteration_start))
train_loss=train_loss/train_steps
################################
## Model validation ##
################################
val_loss = 0.0
val_steps = 0
val_hit_rate=None
t_epoch_train_end=time.time()
logger.info(f"Training costs {t_epoch_train_end-t_epoch_start}s")
model.eval()
for batch, (features, labels) in enumerate(valloader):
t_iteration_start=time.time()
with torch.no_grad():
features=features.to(device)
labels=labels.to(device)
outputs = model(features)
loss = criterion(outputs, labels)
error,hit_rate=data_preprocess.get_metric(outputs.cpu().detach().numpy(), labels.cpu().detach().numpy())
val_loss += loss.cpu().numpy()
val_steps += 1
if val_steps==1:
val_hit_rate=hit_rate
else:
val_hit_rate=(val_steps-1)/val_steps *val_hit_rate+ 1/val_steps*hit_rate
t_iteration_end=time.time()
print("Validate Iteration:{}/{},Loss:{}, Cost {}s".format(val_steps,len(valloader),loss.item(),t_iteration_end-t_iteration_start))
t_epoch_val_end=time.time()
logger.info(f"Validate costs {t_epoch_val_end-t_epoch_train_end}s")
#####################################################
## Save a checkpoint and log to TensorBoard every epoch ##
#####################################################
checkpoint_data = {
"epoch": epoch,
"net_state_dict": model.state_dict(),
"optimizer_state_dict": optimizer.state_dict(),
}
with tempfile.TemporaryDirectory() as checkpoint_dir:
data_path = pathlib.Path(checkpoint_dir) / "data.pkl"
with open(data_path, "wb") as fp:
pickle.dump(checkpoint_data, fp)
checkpoint = ray.train.Checkpoint.from_directory(checkpoint_dir)
reports={"val_loss": val_loss / val_steps,
"train_loss": train_loss,
}
for i,name in enumerate(hydra_cfg.labels_name):
reports[f"train_hit_rate_{name}"]=train_hit_rate[i]
reports[f"val_hit_rate_{name}"]=val_hit_rate[i]
writer.add_histogram(tag=f'train_error_{name}', values=train_errors[:,i], global_step=epoch)
fig, ax = plt.subplots()
ax.scatter(np.arange(train_outputs.shape[0]),train_outputs[:,i])
ax.scatter(np.arange(train_outputs.shape[0]),train_labels[:,i])
writer.add_figure(f'train_imgage_{name}', fig , epoch)
plt.close(fig)
ray.train.report(
reports,
checkpoint=checkpoint,
)
t_epoch_end=time.time()
logger.info(f"Save Checkpoint costs {t_epoch_end-t_epoch_val_end}s")
print("Training Epoch:[{}/{}], Average Loss:{}, Cost {}s".format(epoch+1,hydra_cfg.train.max_epoch,train_loss,t_epoch_end-t_epoch_start))
for i,name in enumerate(hydra_cfg.labels_name):
print(f"train_hit_rate_{name}: {train_hit_rate[i]} ",end=" ")
print(f"val_hit_rate_{name}: {val_hit_rate[i]} ",end=" ")

View File

@ -0,0 +1,71 @@
'''
@File : Naner_label_excel_standardization.py
@Time : 2024/08/09 10:25:21
@Author : Zhanpeng Yang
@Version : 0.0.1
@Contact : zhpyang@outlook.com
@Desc : Convert the label files exported from the steel plant's system into a standard format for later use
'''
import pandas as pd
import os
if __name__ =="__main__":
raw_label_file_path="/home/admin/20240806-NanEr-5-8-data/labels/raw_labels/NanEr/5-8月份6#炉生产数据.xls"
save_dir="/home/admin/20240806-NanEr-5-8-data/labels/NanEr"
select_and_rename={
"炉号":"Furnace_Number",
"钢种": "Steel_Type",
"SL1开始时间":"TSC_start_time",
"SL1结束时间":"TSC_end_time",
"TSC温度":"TSC_T",
"TSC测量碳值":"TSC_C",
"TSC化验碳值":"TSC_C_lab",
"TSC化验磷值":"TSC_P_lab",
"TSC化验锰值":"TSC_Mn_lab",
"TSC化验铬值":"TSC_Cr_lab",
"SL2开始时间":"TSO_start_time",
"SL2结束时间":"TSO_end_time",
"TSO温度":"TSO_T",
"TSO测量碳值":"TSO_C",
"TSO化验碳值":"TSO_C_lab",
"TSO化验磷值":"TSO_P_lab",
"TSO化验锰值":"TSO_Mn_lab",
"TSO化验铬值":"TSO_Cr_lab",
}
##read the file
raw_labels=pd.read_excel(raw_label_file_path)
#keep only the columns we need; selecting from row 1 onwards because row 0 still holds column names
selected_labels=raw_labels.loc[1:,select_and_rename.keys()]
#rename the columns
selected_labels=selected_labels.rename(columns=select_and_rename)
#The logic below has been moved into the data preprocessing step: dropping every row that contains any 0 is too strict; only rows where the specified labels are 0 should be dropped
# # rows containing NULL
# null_rows=selected_labels.isnull().any(axis=1)
# # rows containing 0
# zeros_rows=(selected_labels==0).any(axis=1) | (selected_labels=='0').any(axis=1)
# # drop every row that has a NULL or a 0
# selected_rows=~(null_rows|zeros_rows)
# selected_labels=selected_labels[selected_rows]
if not os.path.exists(save_dir):
os.makedirs(save_dir)
save_path=os.path.join(save_dir,f"{selected_labels.iloc[0]['TSC_start_time'].strftime('%Y-%m-%d')}_{selected_labels.iloc[-1]['TSO_end_time'].strftime('%m-%d')}_{selected_labels.shape[0]}.xlsx")
selected_labels.to_excel(save_path,index=False)
print(f"处理完毕的文件保存于:{save_path}")

View File

@ -0,0 +1,170 @@
import torch
import numpy as np
import h5py
import time
data=np.random.random((60000,448,1024)).astype(np.float32)
# #####numpy: save, load and iterate
# t_1=time.time()
# np.save("./temp/data.npy",data)
# t_2=time.time()
# save_time=t_2-t_1
# print(f"Numpy save time:{save_time}s")
# data=np.load("./temp/data.npy")
# t_3=time.time()
# load_time=t_3-t_2
# print(f"Numpy load time:{load_time}s")
# for i in range(data.shape[0]):
# tmp=data[i]
# # print(tmp)
# t_4=time.time()
# ergodic_time=t_4-t_3
# print(f"Numpy ergodic time:{ergodic_time}s")
# #####torch: save, load and iterate
# data_torch=torch.tensor(data)
# t_1=time.time()
# torch.save(data_torch,"./temp/data.pt")
# t_2=time.time()
# save_time=t_2-t_1
# print(f"Torch save time:{save_time}s")
# data_torch=torch.load("./temp/data.pt",weights_only=True)
# t_3=time.time()
# load_time=t_3-t_2
# print(f"Torch load time:{load_time}s")
# for i in range(data.shape[0]):
# tmp=data[i]
# t_4=time.time()
# ergodic_time=t_4-t_3
# print(f"Torch ergodic time:{ergodic_time}s")
#####h5: save, load and iterate
t_1=time.time()
with h5py.File("./temp/data.h5", 'w') as f:
h5_features = f.create_dataset(
'data',
data.shape,
dtype=np.float32,
)
h5_features[...]=data[...]
t_2=time.time()
save_time=t_2-t_1
print(f"HDF5 save time:{save_time}s")
hdf5_f = h5py.File("./temp/data.h5", 'r')
# load_data= hdf5_f["data"][...]
t_3=time.time()
load_time=t_3-t_2
print(f"HDF5 load time:{load_time}s")
for i in range(hdf5_f["data"].shape[0]):
tmp=hdf5_f["data"][i]
t_4=time.time()
ergodic_time=t_4-t_3
print(f"HDF5 ergodic time:{ergodic_time}s")
for i in range(hdf5_f["data"].shape[0]):
tmp=hdf5_f["data"][i]
t_5=time.time()
ergodic_time=t_5-t_4
print(f"HDF5 twice ergodic time:{ergodic_time}s")
hdf5_f.close()
#####h5 [()]: save, load and iterate
t_1=time.time()
t_2=time.time()
save_time=t_2-t_1
print(f"HDF5 [()] save time:{save_time}s")
hdf5_f = h5py.File("./temp/data_tmp.h5", 'r')
data=hdf5_f["data"][()]
# load_data= hdf5_f["data"][...]
t_3=time.time()
load_time=t_3-t_2
print(f"HDF5 [()] load time:{load_time}s")
for i in range(data.shape[0]):
tmp=data[i]
t_4=time.time()
ergodic_time=t_4-t_3
print(f"HDF5 [()] ergodic time:{ergodic_time}s")
for i in range(data.shape[0]):
tmp=data[i]
t_5=time.time()
ergodic_time=t_5-t_4
print(f"HDF5 [()] twice ergodic time:{ergodic_time}s")
# for i in range(hdf5_f["data"].shape[0]):
# tmp=load_data[i]
# t_6=time.time()
# ergodic_time=t_6-t_5
# print(f"HDF5 loaded ergodic time:{ergodic_time}s")
hdf5_f.close()
# #####h5 optimized: save, load and iterate
# t_1=time.time()
# with h5py.File("./temp/data_optimzed.h5", 'w') as f:
# h5_features = f.create_dataset(
# 'data',
# data.shape,
# maxshape=(None,448,1024),
# chunks=(1,448,1024), # manually set the chunk size to one sample
# # compression='gzip',
# # compression_opts=1,
# dtype=np.float32,
# )
# h5_features[...]=data[...]
# t_2=time.time()
# save_time=t_2-t_1
# print(f"HDF5 Optimized save time:{save_time}s")
# hdf5_f = h5py.File("./temp/data_optimzed.h5", 'r',
# rdcc_nbytes=1024**2*1024*20, # total chunk-cache size in bytes (20 GB here); should be an integer multiple of the chunk size
# rdcc_w0=0, # cache eviction policy: closer to 0 evicts least-recently-used chunks first, closer to 1 evicts fully read/written chunks first
# # rdcc_nslots=1e8, # number of slots for locating cached chunks; recommended 10x the number of cached chunks, 100x for best performance. Default 521
# )
# data=hdf5_f["data"]
# loaddata=data[...]
# t_3=time.time()
# load_time=t_3-t_2
# print(f"HDF5 Optimized load time:{load_time}s")
# for i in range(data.shape[0]):
# tmp=data[i]
# t_4=time.time()
# ergodic_time=t_4-t_3
# print(f"HDF5 Optimized ergodic time:{ergodic_time}s")
# for i in range(data.shape[0]):
# tmp=data[i]
# # print(tmp)
# t_5=time.time()
# ergodic_time=t_5-t_4
# print(f"HDF5 Optimized twice ergodic time:{ergodic_time}s")
# hdf5_f.close