init repository

This commit is contained in:
user 2024-09-11 19:15:44 +08:00
commit cfac5fd675
40 changed files with 1757 additions and 0 deletions

5
.gitignore vendored Normal file
View File

@ -0,0 +1,5 @@
dataset
rawdata
labels
logs

5
CHANGELOG.md Normal file
View File

@ -0,0 +1,5 @@
# Change Log
## v0.1.0.20240910_alpha
- Pipeline runs end to end

9
README.md Normal file
View File

@ -0,0 +1,9 @@
Source code files are under the `src` folder.
```
cd src
python main_train.py
```
Dependencies:
pip install hydra-core "ray[tune]" torchinfo
pip install h5py tensorboard matplotlib openpyxl
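For orientation (inferred from `.gitignore` and `configs/main_train.yaml`, not spelled out in this commit), the data directory referenced by the configs is expected to look roughly like:
```
/code/admin/20240806-NanEr-5-8-data/   # data root set in configs/main_train.yaml
    rawdata/        # raw spectral *.csv index files plus the matching *.bin frame payloads
    labels/NanEr/   # standardized label spreadsheets produced by Naner_label_excel_standardization.py
    dataset/        # HDF5 datasets generated by src/data/pre_process.py
```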

0
TODO.md Normal file
View File

View File

@ -0,0 +1,21 @@
# https://hydra.cc/docs/configure_hydra/intro/
# enable color logging
# defaults:
# - override hydra_logging: colorlog
# - override job_logging: colorlog
# output directory, generated dynamically on each run
run:
dir: ../logs/${task_name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
# sweep:
# dir: ${paths.log_dir}/${task_name}/multiruns/${now:%Y-%m-%d}_${now:%H-%M-%S}
# subdir: ${hydra.job.num}
job_logging:
handlers:
file:
# Incorporates fix from https://github.com/facebookresearch/hydra/pull/2242
filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
verbose: [__main__,hydra]
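For reference (not part of the config itself): with `task_name: main` from `configs/main_train.yaml` below, and Hydra's default job name (the entry script, so `main_train`), a run started at this commit's timestamp would write its log to approximately:
```
../logs/main/2024-09-11_19-15-44/main_train.log
```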

20
configs/main_train.yaml Normal file
View File

@ -0,0 +1,20 @@
defaults:
- _self_
- hydra: default
- ray_tune: default
task_name: main
raw_spectral_data_dir: /code/admin/20240806-NanEr-5-8-data/rawdata
raw_labels_dir: /code/admin/20240806-NanEr-5-8-data/labels/NanEr
dataset_dir: /code/admin/20240806-NanEr-5-8-data/dataset
labels_name: ["TSC_T","TSC_C","TSC_C_lab","TSC_P_lab"]
dataset:
train_ratio: 0.8
validate_ratio: 0.1
num_worker: 128
train:
max_epoch: 10000
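Not in the commit, but any of these values can be overridden on the command line using standard Hydra override syntax, for example:
```
cd src
python main_train.py train.max_epoch=100 dataset.num_worker=16 dataset.train_ratio=0.7
```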

View File

@ -0,0 +1,51 @@
run:
num_samples: 1
resources_per_trial:
cpu: 128
gpu: 1
scheduler:
_target_: ray.tune.schedulers.ASHAScheduler
metric: val_loss
mode: min
max_t: 20000
grace_period: 1
reduction_factor: 2
config:
choose_frame_spatial:
_target_: ray.tune.grid_search
values:
- _target_: data.choose_frame_spatial.mean.ChooseFrameSpatial
interval: [-3,0]
features_scaling:
_target_: ray.tune.grid_search
values:
- _target_: data.features_scaling.max_min.FeatureScaling
labels_scaling:
_target_: ray.tune.grid_search
values:
- _target_: data.labels_scaling.max_min.LabelScaling
model:
_target_: ray.tune.grid_search
values:
# - _target_: model.FCNN.FCNN
# - _target_: model.CNN_LSTM_FCNN.CNN_LSTM_FCNN
- _target_: model.CNN1D_FCNN.CNN1D_FCNN
criterion:
_target_: ray.tune.grid_search
values:
- _target_: torch.nn.MSELoss
optimizer:
_target_: ray.tune.grid_search
values:
- _target_: torch.optim.Adam
_partial_: true
lr: 0.0001
batch_size:
_target_: ray.tune.grid_search
values:
- 128
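For context, `src/main_train.py` (below) turns each `_target_` entry under `config:` into a Ray Tune search space via `hydra.utils.instantiate(..., _recursive_=False)`. A minimal sketch of that pattern, using a hypothetical learning-rate node rather than one taken from this file:
```
import hydra
from omegaconf import OmegaConf

# Hypothetical node in the same style as the entries above: a grid search over two values.
node = OmegaConf.create({"_target_": "ray.tune.grid_search", "values": [0.001, 0.0001]})

# _recursive_=False keeps nested _target_ entries (models, criteria, ...) un-instantiated,
# so they can be instantiated later inside the trainable, as trainable.py does.
search_space = hydra.utils.instantiate(node, _recursive_=False)
print(search_space)  # {'grid_search': [0.001, 0.0001]}
```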

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1,100 @@
'''
@File : mean.py
@Time : 2024/08/12 13:53:36
@Author : Zhanpeng Yang
@Version : 0.0.1
@Contact : zhpyang@outlook.com
@Desc : Stage-1 data preprocessing: select suitable spectra from the continuous spectral stream as the input features
'''
import datetime
import numpy as np
class ChooseFrameSpatial:
def __init__(self, interval=[-3,0]):
"""注意,算法所需要的超参数均从此处传入,作为类成员,其他函数不允许传入超参数
Args:
interval (list, optional): 此方法依赖的光谱时间范围默认[-30,30]即获取前TSC开始时刻前30秒到TSC结束时刻的光谱数据作为此算法输入.
"""
self.description=f"{str(self.__class__).split('.')[2]}_{interval[0]}_{interval[1]}"
# print(self.description)
self.interval=interval
def get_data_interval(self, start_time, end_time ):
return start_time+datetime.timedelta(seconds=self.interval[0]),start_time+datetime.timedelta(seconds=self.interval[1])
def run(self,timestamps,rawdata):
############# Spatial data compression ######################
space_intensity_mean = np.mean(np.sum(rawdata, axis=1), axis=0)
space_num= 10 # number of spatial points with the smallest variance to keep
light_indices = np.where( (space_intensity_mean > (0.25*np.max(space_intensity_mean)+0.75*np.min(space_intensity_mean)))
& (space_intensity_mean < (0.8*np.max(space_intensity_mean)+0.2*np.min(space_intensity_mean)))
)
if light_indices[0].shape[0] < space_num:
print('Not enough usable spatial points found!')
return None
intensity_data = rawdata[:,:,light_indices[0]]
spatial_variance = np.var(np.sum(intensity_data,axis=1), axis=0)
top_10_indices = np.argsort(spatial_variance)[:space_num]
space_selected_data = intensity_data[:, :, top_10_indices]
############# Temporal data compression ######################
# framerate = 24
# timewindow = 2
# time_data_intensity = np.sum(np.mean(space_selected_data,axis=2),axis=1)
# min_var = float('inf')
# min_index = 0
# for i in range(len(time_data_intensity)-framerate*timewindow-1):
# window_var = np.var(time_data_intensity[i:i+framerate*timewindow])
# if window_var < min_var:
# min_var = window_var
# min_index = i
# selected_data = space_selected_data[min_index:min_index+framerate*timewindow,:,:]
selected_data=np.mean(np.mean(space_selected_data,axis=0),axis=1)
# print("timewindow_begin=",min_index)
# if (result_dir is not None) and (filenum is not None) :
# for i in range (selected_data.shape[2]):
# Z=selected_data[:,:,i]
# x=np.linspace(400,1000,Z.shape[1])
# y=selected_data[:,1,i]
# x, y = np.meshgrid(x, y)
# fig, ax = plt.subplots(subplot_kw=dict(projection='3d'))
# surf = ax.plot_surface(x, y, Z, cmap=cm.Blues,
# linewidth=0, antialiased=False)
# ax.view_init(elev=30, azim=-60) # change the viewing angle
# ax.set_zlim(0, 4095)
# ax.set_zlabel("Light Intensity")
# ax.set_xlabel("Spectral Band(nm)")
# ax.set_ylabel("Time (Serial Number)")
# plt.savefig(f"{result_dir}{ filenum} file {i}-th spatial_point_line.png")
# plt.close()
return selected_data
if __name__ =="__main__":
timestamp=np.zeros((600,))
input_data=np.random.random((600,224,512))*4095
tmp=ChooseFrameSpatial()
output=tmp.run(timestamp,input_data)
print(f"Input data shape: {input_data.shape}\nOutput data shape: {output.shape}")

View File

@ -0,0 +1,132 @@
'''
@File : mean.py
@Time : 2024/08/12 13:53:36
@Author : Zhanpeng Yang
@Version : 0.0.1
@Contact : zhpyang@outlook.com
@Desc : Stage-1 data preprocessing: select suitable spectra from the continuous spectral stream as the input features
'''
import datetime
import numpy as np
class ChooseFrameSpatial:
def __init__(self, interval=[-30,30],overexposion_threshold = 1,weight_mean_var = 0.5,space_num= 10, time_num = 9,framerate = 4,timewindow_time = 1,Type=6):
"""注意,算法所需要的超参数均从此处传入,作为类成员,其他函数不允许传入超参数
Args:
interval (list, optional): 此方法依赖的光谱时间范围默认[-30,30]即获取前TSC开始时刻前30秒到TSC结束时刻的光谱数据作为此算法输入.
"""
# print(str(self.__class__).split("."))
self.description=f"{str(self.__class__).split('.')[-2]}_{interval[0]}_{interval[1]}"
# print(self.description)
self.overexposion_threshold = overexposion_threshold # overexposure threshold: a spatial point is judged overexposed once this many of its time points are overexposed
self.weight_mean_var =weight_mean_var # intensity/variance selection weight: mean intensity is weighted 1 and this is the variance weight, in [0, 1]
self.space_num= space_num # number of spatial points to select
self.time_num = time_num # number of time segments to select
self.framerate = framerate # sampling frame rate
self.timewindow = framerate * timewindow_time # time window length in frames (1 s by default)
self.interval=interval
self.Type=Type
def get_data_interval(self, start_time, end_time ):
return start_time+datetime.timedelta(seconds=self.interval[0]),end_time+datetime.timedelta(seconds=self.interval[1])
def Overexposed_spatial_point_detection(self,inputdata, timethreshold):
#Decide whether a spatial point is overexposed. inputdata is a 2-D (time, wavelength) array; timethreshold is how many overexposed time points it takes to flag the whole point as overexposed
# Returns False if overexposed, True otherwise
row_max_values= np.max(inputdata, axis=1)
overexposed_rows = np.sum(row_max_values >= 4095)
if overexposed_rows > timethreshold:
return False
else:
return True
def run(self,timestamps,rawdata):
# Dimensionality reduction: spatially, keep points whose intensity clears a threshold and whose variance is smallest; temporally, split the whole interval into segments and average the highest-scoring 1 s window of each. Type=1 keeps the first 1/3 of segments, Type=2 the middle 1/3, Type=3 the last 1/3, Type=4 the first 2/3, Type=5 the last 2/3, Type=6 all of them
############# Remove overexposed spatial points ######################
unoverposed_indices = [i for i in range(rawdata.shape[2]) if self.Overexposed_spatial_point_detection(rawdata[:, :, i],self.overexposion_threshold)]
rawdata = rawdata[:,:,unoverposed_indices]
############# Spatial data compression ######################
space_intensity_mean = np.mean(np.sum(rawdata, axis=1), axis=0)
space_intensity_var = np.var(np.sum(rawdata, axis=1), axis=0)
combined_score = (space_intensity_mean - np.min(space_intensity_mean)) / (np.max(space_intensity_mean) - np.min(space_intensity_mean)) \
- self.weight_mean_var * (space_intensity_var - np.min(space_intensity_var)) / (np.max(space_intensity_var) - np.min(space_intensity_var))
sorted_indices = np.argsort(combined_score)[::-1]
space_selected_data = rawdata[:,:,sorted_indices[:self.space_num]]
############# Temporal data compression ######################
space_selected_data_intensity = np.sum(space_selected_data,axis=1)
#Split the intensity data along time into time_num chunks, each of shape (total_time_points/time_num, space_num)
time_length = space_selected_data_intensity.shape[0]
chunk_size = time_length // self.time_num
remainder = time_length % self.time_num
start_index = 0
time_selected_data =None
for i in range(self.time_num):
end_index = start_index + chunk_size + (1 if i < remainder else 0)
chunk = space_selected_data_intensity[start_index:end_index, :]
window_var = np.empty(0)
window_mean = np.empty(0)
for j in range(len(chunk)-self.timewindow-1):
window_var = np.concatenate((window_var, np.expand_dims( np.sum(np.var(chunk[j:j+self.timewindow,:], axis=0)),0)), axis=0)
window_mean = np.concatenate((window_mean, np.expand_dims( np.sum(np.mean(chunk[j:j+self.timewindow,:], axis=0)),0)), axis=0)
combined_score_time = (window_mean - np.min(window_mean)) / (np.max(window_mean) - np.min(window_mean)) \
- self.weight_mean_var * (window_var - np.min(window_var)) / (np.max(window_var) - np.min(window_var))
sorted_indices = np.argsort(combined_score_time)[::-1]
time_window_data = np.mean(space_selected_data[start_index+sorted_indices[0]:start_index+sorted_indices[0]+self.timewindow, :, :],axis=0)
# print(time_selected_data.shape,time_window_data.shape,np.expand_dims( time_window_data,0).shape)
if time_selected_data is None:
time_selected_data=np.expand_dims( time_window_data,0)
else:
time_selected_data = np.concatenate((time_selected_data, np.expand_dims( time_window_data,0)), axis=0)
start_index = end_index
if self.Type == 1:
print("time_selected_data[前1/3].shape: ", time_selected_data[:self.time_num//3,:,:].shape)
return time_selected_data[:self.time_num//3,:,:]
elif self.Type == 2:
print("time_selected_data[中1/3].shape: ", time_selected_data[self.time_num//3:2*self.time_num//3,:,:].shape)
return time_selected_data[self.time_num//3:2*self.time_num//3,:,:]
elif self.Type == 3:
print("time_selected_data[后1/3].shape: ", time_selected_data[2*self.time_num//3:,:,:].shape)
return time_selected_data[2*self.time_num//3:,:,:]
elif self.Type == 4:
print("time_selected_data[前2/3].shape: ", time_selected_data[:2*self.time_num//3,:,:].shape)
return time_selected_data[:2*self.time_num//3,:,:]
elif self.Type == 5:
print("time_selected_data[后2/3].shape: ", time_selected_data[self.time_num//3:,:,:].shape)
return time_selected_data[self.time_num//3:,:,:]
elif self.Type == 6:
print("time_selected_data.shape: ", time_selected_data.shape)
return time_selected_data
else:
print("Type is not 1, 2, 3, 4, 5 or 6 !")
return None
if __name__ =="__main__":
timestamp=np.zeros((600,))
input_data=np.random.random((600,224,512))*4095
tmp=ChooseFrameSpatial()
output=tmp.run(timestamp,input_data)
print(f"Input data shape: {input_data.shape}\nOutput data shape: {output.shape}")

View File

@ -0,0 +1,30 @@
'''
@File : max_min.py
@Time : 2024/08/12 14:02:59
@Author : Zhanpeng Yang
@Version : 0.0.1
@Contact : zhpyang@outlook.com
@Desc : Min-max scaling of the input features
'''
import numpy as np
class FeatureScaling:
def __init__(self):
self.description=f"{str(self.__class__).split('.')[-2]}"
self.flag_get_global_info=True
self._min=999999999
self._max=0
def get_global_info(self,feature):
tmp_max=np.max(feature)
tmp_min=np.min(feature)
if tmp_max>self._max:
self._max=tmp_max
if tmp_min<self._min:
self._min=tmp_min
def run( self,feature):
feature=(feature-self._min)/(self._max-self._min)
return feature
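A minimal usage sketch (not in the commit) of the two-pass protocol that `pre_process.py` below follows: one pass over all samples to collect the global min/max, then a second pass to scale each sample into [0, 1]. Run from `src/` so the `data` package is importable; the toy arrays are invented.
```
import numpy as np
from data.features_scaling.max_min import FeatureScaling

scaler = FeatureScaling()
samples = [np.random.random((9, 224, 10)) * 4095 for _ in range(5)]  # toy spectra
if scaler.flag_get_global_info:
    for s in samples:          # pass 1: accumulate the global min and max
        scaler.get_global_info(s)
scaled = [scaler.run(s) for s in samples]  # pass 2: min-max scale each sample
print(scaled[0].min(), scaled[0].max())    # both end up within [0, 1]
```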

View File

@ -0,0 +1,45 @@
'''
@File : max_min.py
@Time : 2024/08/12 14:03:38
@Author : Zhanpeng Yang
@Version : 0.0.1
@Contact : zhpyang@outlook.com
@Desc : Min-max scaling of the output labels
'''
import numpy as np
class LabelScaling:
def __init__(self):
self.description=f"{str(self.__class__).split('.')[-2]}"
self.flag_get_global_info=True
self._min=None
self._max=None
def get_global_info(self,label):
label_dim=label.shape[0]
if self._max is None:
self._min=np.zeros((label_dim))+999999999
self._max=np.zeros((label_dim))
for i in range(label_dim):
# tmp_max=np.max(label)
# tmp_min=np.min(label)
if label[i]>self._max[i]:
self._max[i]=label[i]
if label[i]<self._min[i]:
self._min[i]=label[i]
def run(self,label):
for i in range(label.shape[0]):
label[i]=(label[i]-self._min[i])/(self._max[i]-self._min[i])
return label
def reverse(self,labels):
for i in range(labels.shape[1]):
labels[:,i]=labels[:,i]*(self._max[i]-self._min[i])+self._min[i]
return labels
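A small sketch (again an assumption, with invented numbers) of the `reverse` direction, which `get_metric()` in `pre_process.py` below uses to map network outputs in [0, 1] back to physical units before computing errors and hit rates:
```
import numpy as np
from data.labels_scaling.max_min import LabelScaling

scaler = LabelScaling()
# Accumulate per-label min/max from two invented label vectors (TSC_T, TSC_C, TSC_C_lab, TSC_P_lab).
for label in [np.array([1600.0, 0.45, 0.44, 0.020]), np.array([1650.0, 0.60, 0.61, 0.030])]:
    scaler.get_global_info(label)
outputs = np.array([[0.5, 0.5, 0.5, 0.5]])  # a batch of scaled network outputs, shape (batch, 4)
print(scaler.reverse(outputs))              # back in physical units, e.g. 1625.0 for TSC_T
```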

373
src/data/pre_process.py Normal file
View File

@ -0,0 +1,373 @@
'''
@File : pre_process.py
@Time : 2024/08/09 13:54:28
@Author : Zhanpeng Yang
@Version : 0.0.1
@Contact : zhpyang@outlook.com
@Desc : Data preprocessing
'''
import pandas as pd
import logging
import os,glob
import numpy as np
import datetime
import pickle
import h5py
class DataPreProcess:
"""
Contains all the data preprocessing steps, including:
1. Extract the spectra around the target times from the raw binary data, using the Excel files provided by the steel plant
2. From those spectra, select stable time and spatial points as the features
3. Scale the features and labels
"""
def __init__(self, raw_spectral_data_dir, raw_labels_dir,labels_name,dataset_dir,choose_frame_spatial, features_scaling,labels_scaling,train_ratio=0.8,validate_ratio=0.1) -> None:
"""初始化类,传入所有数据预处理需要的参数
Args:
raw_spectral_data_dir (_type_):原始光谱数据路径
raw_labels_dir (_type_): 原始标签路径
labels_name (_type_): 提取出的标签名
dataset_dir (_type_): 生成的数据集文件夹
choose_frame_spatial (_type_): 类对象进行光谱特征挑选
features_scaling (_type_): 类对象对输入的特征进行放缩使之直接输入神经网络
labels_scaling (_type_): 类对象对输出的的标签进行放缩
train_ratio (float, optional): 训练集比例. Defaults to 0.8.
validate_ratio (float, optional): 验证集比例剩余的即为测试集. Defaults to 0.1.
"""
self.raw_spectral_data_dir=raw_spectral_data_dir
self.raw_labels_dir=raw_labels_dir
self.dataset_dir=dataset_dir
self.labels_name=labels_name
self.choose_frame_spatial=choose_frame_spatial
self.features_scaling=features_scaling
self.labels_scaling=labels_scaling
self.train_ratio=train_ratio
self.validate_ratio=validate_ratio
#load the raw labels
self.raw_labels=self._load_raw_labels(raw_labels_dir,labels_name)
#load the cache of the raw spectra
self.raw_spectral_data_cache=self._load_raw_spectral_data_cache(raw_spectral_data_dir)
#load an arbitrary csv file to determine the spectral data dimensions
self.spectral_dim,self.spatial_dim= self._get_spectral_spatial_dim(self.raw_spectral_data_cache)
#start converting the raw data into the dataset
self._raw_data_2_dataset()
def _load_raw_labels(self,raw_labels_dir:str,labels_name:list)->pd.DataFrame:
"""读取利用脚本处理后的钢厂给的excel文件所在文件夹会扫描所有文件
并选择指定的label名作为output
并去除值为NaN或0的行
Args:
raw_labels_dir (str): 利用脚本处理后的钢厂给的excel路径
labels_name (list): 指定的作为Label的列
Returns:
pd.DataFrame: 返回所有筛选后的炉次数据
"""
raw_labels=None
for name in os.listdir(raw_labels_dir):
tmp_raw_labels=pd.read_excel(os.path.join(raw_labels_dir,name))
choosed_column=["TSC_start_time","TSC_end_time"]
choosed_column=choosed_column+labels_name
#keep only the columns we want as labels
tmp_raw_labels=tmp_raw_labels.loc[:,choosed_column]
# rows containing NULL
null_rows=tmp_raw_labels.isnull().any(axis=1)
# # rows containing 0
zeros_rows=(tmp_raw_labels==0).any(axis=1) | (tmp_raw_labels=='0').any(axis=1)
# # drop every row that has a NULL or a 0
selected_rows=~(null_rows|zeros_rows)
tmp_raw_labels=tmp_raw_labels[selected_rows]
if raw_labels is None:
raw_labels=tmp_raw_labels
else:
raw_labels=pd.concat([raw_labels,tmp_raw_labels],axis=0)
logging.debug(f"Reading raw label excel file:{name}, which has {tmp_raw_labels.shape[0]} furnaces")
logging.debug(f"Readed raw label excel files, which has {raw_labels.shape[0]} furnaces in total")
return raw_labels
def _load_raw_spectral_data_cache(self,raw_spectral_data_dir:str)->list:
"""生成所有原始光谱数据文件的缓存,包括每个文件记录的开始及结束时间,目的为加快后面读取原始数据的速度
Args:
raw_spectral_data_dir (str): 原始光谱所在路径
Returns:
list: 缓存其中有多少个成员就有多少个原始数据文件每个成员包括格式为datetime的开始及结束时间及文件路径
"""
spectral_file_paths=glob.glob(os.path.join(raw_spectral_data_dir,"*.csv"))
cache_file_paths=glob.glob(os.path.join(raw_spectral_data_dir,"*.pkl"))
update_flag=False
#Regenerate if no cache file exists, or if the number of data files recorded in the cache does not match the number of files in the folder
if len(cache_file_paths)==0:
logging.debug(f"Raw spectral data cache is not existed! Generating")
update_flag=True
elif len(cache_file_paths)==1:
with open(cache_file_paths[0],"rb") as f:
raw_spectral_data_cache=pickle.load(f)
if len(raw_spectral_data_cache) !=len(spectral_file_paths):
logging.debug(f"Raw spectral data cache is out of date! Regenerating, cache file number:{len(raw_spectral_data_cache)}, spectral data file number: {len(spectral_file_paths)}")
update_flag=True
else:
logging.error(f"More the one 'raw_spectral_data_cache.pkl' file is existed in {raw_spectral_data_dir}")
if update_flag:
spectral_file_paths.sort()
raw_spectral_data_cache=[]
for file in spectral_file_paths:
tmp_info={}
tmp_data=np.loadtxt(file, delimiter=",")
start_t=datetime.datetime.fromtimestamp(tmp_data[0,0]/1000)+datetime.timedelta(microseconds=tmp_data[0,0]%1000)
end_t=datetime.datetime.fromtimestamp(tmp_data[-1,0]/1000)+datetime.timedelta(microseconds=tmp_data[-1,0]%1000)
tmp_info["start_t"]=start_t
tmp_info["end_t"]=end_t
tmp_info["file_path"]=file
raw_spectral_data_cache.append(tmp_info)
with open(os.path.join(raw_spectral_data_dir,f"raw_spectral_data_cache.pkl"),"wb") as f:
pickle.dump(raw_spectral_data_cache,f)
return raw_spectral_data_cache
def _get_spectral_spatial_dim(self,raw_spectral_data_cache):
data=np.loadtxt(raw_spectral_data_cache[0]["file_path"], delimiter=",").astype(np.uint64)
if data[0,2]==229376:
spectral_dim =224
spatial_dim=512
if data[0,2]==917504:
spectral_dim =448
spatial_dim=1024
return spectral_dim, spatial_dim
def _read_spectral_data(self,start_time:datetime.datetime,end_time:datetime.datetime)->np.ndarray:
"""获取从start_time到end_time的光谱数据
Args:
start_time (datetime.datetime): 开始时间
end_time (datetime.datetime): 结束时间
Returns:
np.ndarray: 原始光谱数据
"""
def get_spectral_data_per_file(file_path,s_t,e_t):
data=np.loadtxt(file_path, delimiter=",").astype(np.uint64)
if s_t is not None:
tmp_s=datetime.datetime.timestamp(s_t)*1000
tmp_s_index=0
for i in range(data.shape[0]-1):
if data[i,0]<=tmp_s and data[i+1,0]>=tmp_s:
tmp_s_index=i
break
else:
tmp_s_index=0
if e_t is not None:
tmp_e=datetime.datetime.timestamp(e_t)*1000
tmp_e_index=data.shape[0]
for i in range(tmp_s_index,data.shape[0]-1):
if data[i,0]<=tmp_e and data[i+1,0]>=tmp_e:
tmp_e_index=i
break
else:
tmp_e_index=data.shape[0]
with open(file_path[:-3]+"bin", "rb") as f:
f.seek(data[tmp_s_index,1])
d=f.read(np.uint64((tmp_e_index-tmp_s_index)*data[tmp_s_index,2]))
d=np.frombuffer(d, dtype=np.uint16).reshape(tmp_e_index-tmp_s_index,self.spectral_dim,self.spatial_dim)
return data[tmp_s_index:tmp_e_index,0],d
timestamps=None
raw_spectral_data=None
for tmp_info in self.raw_spectral_data_cache:
tmp_data=None
if start_time<tmp_info["start_t"] and end_time>tmp_info["start_t"] and end_time<tmp_info["end_t"]:
# The target interval overlaps the left side of this file's interval, so take data from the start of the file up to end_time
tmp_time_stamp,tmp_data=get_spectral_data_per_file(tmp_info["file_path"] ,None,end_time)
elif start_time<tmp_info["start_t"] and end_time>tmp_info["end_t"]:
# The target interval fully contains this file's interval, so take the whole file
tmp_time_stamp,tmp_data=get_spectral_data_per_file(tmp_info["file_path"] ,None,None)
elif start_time>tmp_info["start_t"] and end_time<tmp_info["end_t"]:
# The target interval is fully contained in this file's interval, so take data from start_time to end_time
tmp_time_stamp,tmp_data=get_spectral_data_per_file(tmp_info["file_path"] ,start_time,end_time)
elif start_time>tmp_info["start_t"] and start_time<tmp_info["end_t"] and end_time>tmp_info["end_t"]:
# The target interval overlaps the right side of this file's interval, so take data from start_time to the end of the file
tmp_time_stamp,tmp_data=get_spectral_data_per_file(tmp_info["file_path"] ,start_time,None)
if tmp_data is not None:
if raw_spectral_data is None:
timestamps=tmp_time_stamp
raw_spectral_data=tmp_data
else:
timestamps=np.concatenate((timestamps,tmp_time_stamp),axis=0)
raw_spectral_data=np.concatenate((raw_spectral_data,tmp_data),axis=0)
return timestamps,raw_spectral_data
def _raw_data_2_dataset(self):
save_dir=os.path.join(self.dataset_dir,self.choose_frame_spatial.description)
#Step 1: select the feature time and spatial points, and save the result
pre_dataset_save_dir=os.path.join(save_dir,"data")
if not os.path.exists(pre_dataset_save_dir):
os.makedirs(pre_dataset_save_dir)
#If the data directory does not exist it is regenerated; if it exists, this part is skipped.
# Note: when new data arrives, delete the folders under dataset (i.e. clear this cache) so everything is regenerated
for i in range(self.raw_labels.shape[0]):
start_time,end_time=self.choose_frame_spatial.get_data_interval(self.raw_labels.iloc[i]["TSC_start_time"],self.raw_labels.iloc[i]["TSC_end_time"])
timestamps,raw_spectral_data=self._read_spectral_data(start_time,end_time)
if raw_spectral_data is not None:
#only record if the retrieved data averages more than 2 frames per second
if raw_spectral_data.shape[0]>2*(end_time-start_time).total_seconds():
logging.debug(f"PreProcess Stage 1: [{i+1}/{self.raw_labels.shape[0]}] with {timestamps.shape[0]} frames")
raw_spectral_data=self.choose_frame_spatial.run(timestamps,raw_spectral_data)
np.savez(os.path.join(pre_dataset_save_dir,f"{timestamps[0]}_{timestamps.shape[0]}.npz",),timestamps=timestamps,raw_spectral_data=raw_spectral_data,raw_labels=self.raw_labels.iloc[i][self.labels_name].to_numpy())
else:
logging.info(f"Pre Dataset is existed in {pre_dataset_save_dir}")
#Step 2: apply scaling and assemble the dataset
self.dataset_save_dir=os.path.join(save_dir,f"{self.features_scaling.description}_{self.labels_scaling.description}")
if not os.path.exists(self.dataset_save_dir):
os.makedirs(self.dataset_save_dir)
pre_dataset_files=os.listdir(pre_dataset_save_dir)
if self.features_scaling.flag_get_global_info:
for name in pre_dataset_files:
tmp_data=np.load(os.path.join(pre_dataset_save_dir,name),allow_pickle=True)
self.features_scaling.get_global_info(tmp_data["raw_spectral_data"])
if self.labels_scaling.flag_get_global_info:
self.labels_scaling.get_global_info(tmp_data["raw_labels"])
# get the dimension info
tmp_data=np.load(os.path.join(pre_dataset_save_dir,name),allow_pickle=True)
feature_dim=tmp_data["raw_spectral_data"].shape
label_dim=tmp_data["raw_labels"].shape
#split into training / validation / test sets
np.random.shuffle(pre_dataset_files)
[train_dateset_files,validate_dateset_files]=np.split(pre_dataset_files,[int(len(pre_dataset_files)*self.train_ratio)])
[validate_dateset_files,test_dateset_files]=np.split(validate_dateset_files,[int(len(pre_dataset_files)*self.validate_ratio)])
#write the HDF5 files
for dataset_name in ["train","validate","test"]:
if dataset_name=="train":
file_names=train_dateset_files
elif dataset_name=="validate":
file_names=validate_dateset_files
elif dataset_name=="test":
file_names=test_dateset_files
logging.info(f"Generating {dataset_name} dataset with {len(file_names)} samples")
with h5py.File(os.path.join( self.dataset_save_dir,f"{dataset_name}.h5"), 'w') as f:
h5_features = f.create_dataset(
'features',
tuple([len(file_names)]+list(feature_dim)),
maxshape=(tuple([None]+list(feature_dim))),
chunks=tuple([1]+list(feature_dim)), # manually set the chunk size to one sample: tuple([1]+list(feature_dim))
# compression='gzip',
# compression_opts=9,
dtype=np.float32,
)
h5_labels = f.create_dataset(
'labels',
tuple([len(file_names)]+list(label_dim)),
chunks=tuple([1]+list(label_dim)), # manually set the chunk size to one sample
dtype=np.float32,
)
for i,name in enumerate(file_names):
tmp_data=np.load(os.path.join(pre_dataset_save_dir,name),allow_pickle=True)
feature=self.features_scaling.run(tmp_data["raw_spectral_data"])
label=self.labels_scaling.run(tmp_data["raw_labels"])
h5_features[i]=feature.astype(np.float32)
h5_labels[i]=label.astype(np.float32)
else:
pre_dataset_files=os.listdir(pre_dataset_save_dir)
if self.labels_scaling.flag_get_global_info:
for name in pre_dataset_files:
tmp_data=np.load(os.path.join(pre_dataset_save_dir,name),allow_pickle=True)
self.labels_scaling.get_global_info(tmp_data["raw_labels"])
logging.info(f"Standardized Dataset is existed in {self.dataset_save_dir}")
def get_metric(self,outputs,labels):
outputs=self.labels_scaling.reverse(outputs)
labels=self.labels_scaling.reverse(labels)
error=outputs-labels
# print("outputs",outputs)
# print("labels",labels)
# print("errors",error)
hit_rate=np.zeros(error.shape[1])
for i,name in enumerate(self.labels_name):
# error[:,i]=outputs[:,i]-labels[:,i]
#["TSC_T","TSC_C","TSC_C_lab","TSC_P_lab"]
if name =="TSC_T":
bound=10
elif name=="TSC_C":
bound=0.05
elif name=="TSC_C_lab":
bound=0.05
elif name=="TSC_P_lab":
bound=0.0005
hit_rate[i]=((error[:,i]>=-bound) &(error[:,i]<=bound)).sum() /error.shape[0]
return error,hit_rate
if __name__=="__main__":
logging.basicConfig(level = logging.DEBUG)
raw_data_dir="/code/admin/20240806-NanEr-5-8-data/rawdata"
labels_path="/code/admin/20240806-NanEr-5-8-data/labels/NanEr"
dataset_dir="/code/admin/20240806-NanEr-5-8-data/dataset"
labels_name=["TSC_T","TSC_C","TSC_C_lab","TSC_P_lab"]
from choose_frame_spatial.mean import ChooseFrameSpatial
from features_scaling.max_min import FeatureScaling
from labels_scaling.max_min import LabelScaling
choose_frame_spatial=ChooseFrameSpatial(interval=[-30,30])
features_scaling=FeatureScaling()
labels_scaling=LabelScaling()
data_pre_process=DataPreProcess(raw_data_dir,labels_path,labels_name,dataset_dir,choose_frame_spatial,features_scaling,labels_scaling)
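To make the hit-rate definition in `get_metric` above concrete, a toy check with invented numbers: a prediction counts as a hit when its signed error stays within the per-label bound (±10 for TSC_T, ±0.05 for TSC_C and TSC_C_lab, ±0.0005 for TSC_P_lab).
```
import numpy as np

errors = np.array([            # columns: TSC_T, TSC_C, TSC_C_lab, TSC_P_lab (invented)
    [  8.0,  0.02,  0.06,  0.0004],
    [-12.0, -0.01,  0.03, -0.0007],
])
bounds = np.array([10.0, 0.05, 0.05, 0.0005])
hit_rate = ((errors >= -bounds) & (errors <= bounds)).sum(axis=0) / errors.shape[0]
print(hit_rate)  # [0.5 1.  0.5 0.5]
```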

View File

@ -0,0 +1,50 @@
import torch
import h5py
import os
import time
def load_dataset(dataset_dir):
train_dataset=SpectralDataset(os.path.join(dataset_dir,"train.h5"))
val_dataset=SpectralDataset(os.path.join(dataset_dir,"validate.h5"))
return train_dataset, val_dataset
class SpectralDataset(torch.utils.data.Dataset):
def __init__(self, hdf5_path):
self.hdf5_path=hdf5_path
self.hdf5_f = h5py.File(hdf5_path, 'r',
# rdcc_nbytes=1024**2*1024*10, # total chunk-cache size in bytes (10 GB here); should be an integer multiple of the chunk size
# rdcc_w0=0, # cache eviction policy: closer to 0 evicts least-recently-used chunks first, closer to 1 evicts fully read/written chunks first
# rdcc_nslots=1e8, # number of slots for locating cached chunks; recommended 10x the number of cached chunks, 100x for best performance. Default 521
)
self.features=self.hdf5_f["features"]
self.labels=self.hdf5_f["labels"]
def __len__(self):
return self.features.shape[0]
def __getitem__(self, idx):
#build the input
feature=self.features[idx]
label=self.labels[idx]
return feature, label
if __name__ =="__main__":
training_data=SpectralDataset("/home/admin/20240806-NanEr-5-8-data/dataset/mean_-30_30/max_min_max_min/train.h5")
print("数据集大小为:",len(training_data))
print("输入Feature维度为", training_data[0][0].shape, "输出Label维度为",training_data[0][1].shape)
from torch.utils.data import DataLoader
train_dataloader = DataLoader(training_data, batch_size=64, shuffle=True)
train_features, train_labels = next(iter(train_dataloader))
print(f"Feature batch shape: {train_features.size()}")
print(f"Labels batch shape: {train_labels.size()}")

View File

@ -0,0 +1,78 @@
import torch.nn as nn
import torch.nn.functional as F
import torchinfo
import torch
class CNN1D_FCNN(nn.Module):
def __init__(self,):
super(CNN1D_FCNN, self).__init__()
self.Conv2d_1=nn.Conv1d(in_channels=1, out_channels=256, kernel_size=3,stride=1)
self.Conv2d_2=nn.Conv1d(in_channels=256, out_channels=512, kernel_size=3,stride=1)
self.MaxPool1d_1=nn.MaxPool1d(kernel_size=2, stride=2)
self.Conv2d_3=nn.Conv1d(in_channels=512, out_channels=256, kernel_size=3,stride=1)
self.Conv2d_4=nn.Conv1d(in_channels=256, out_channels=32, kernel_size=3,stride=1)
self.Conv2d_5=nn.Conv1d(in_channels=32, out_channels=3, kernel_size=2,stride=1)
self.flatten=nn.Flatten(start_dim=1, end_dim=2)
# self.fc_1=nn.Linear(in_features=645,out_features=1024) #2048->1024
# self.fc_2=nn.Linear(in_features=1024,out_features=512)
self.fc_3=nn.Linear(in_features=315,out_features=256)
self.fc_4=nn.Linear(in_features=256,out_features=128)
self.fc_5=nn.Linear(in_features=128,out_features=4)
# self.reshape_3=nn.Unflatten(dim=0,unflattened_size=(batch_size,601))
def forward(self, x):
# input_shape=x.shape
x=torch.mean(x,dim=1,keepdim=True)
x=torch.mean(x,dim=3)
print(x.shape)
# x=torch.unsqueeze(x, 1)
x=F.tanh(self.Conv2d_1(x))
x=F.tanh(self.Conv2d_2(x))
x=self.MaxPool1d_1(x)
x=F.tanh(self.Conv2d_3(x))
x=F.tanh(self.Conv2d_4(x))
x=F.tanh(self.Conv2d_5(x))
x=self.flatten(x)
# x=F.tanh(self.fc_1(x))
# x=F.tanh(self.fc_2(x))
x=F.tanh(self.fc_3(x))
x=F.tanh(self.fc_4(x))
x=F.sigmoid(self.fc_5(x))
#
# x=nn.Unflatten(dim=0,unflattened_size=(int(input_shape[0]),int(input_shape[1])))(x)
# x=nn.Flatten(start_dim=2, end_dim=4)(x)
# x,_=self.lstm_1(x)
# x=x[:,-1,:]
# x=nn.LeakyReLU()(self.fc_1(x))
# x=nn.LeakyReLU()(self.fc_2(x))
# x=nn.Sigmoid()(self.fc_3(x))
return x
if __name__=="__main__":
model=CNN1D_FCNN()
torchinfo.summary(model,input_size=(64, 48, 224, 10))

Binary file not shown.

Binary file not shown.

Binary file not shown.

1
src/inference/main.py Normal file
View File

@ -0,0 +1 @@
np

Binary file not shown.

12
src/inference/test.http Normal file
View File

@ -0,0 +1,12 @@
GET http://localhost:8000
###
POST http://localhost:8000/inference HTTP/1.1
Content-Type: application/json
{
"type": "a",
"a":"c"
}

17
src/inference/test.py Normal file
View File

@ -0,0 +1,17 @@
from fastapi import FastAPI
from pydantic import BaseModel
app = FastAPI()
@app.get("/")
async def root():
return {"message": "Hello World"}
class Item(BaseModel):
type: str
@app.post("/inference/")
async def inference(data:Item):
print(data)
return {"message": "Hello World"}

56
src/main_train.py Normal file
View File

@ -0,0 +1,56 @@
import hydra
import omegaconf # DictConfig, OmegaConf
import logging
import ray,os
import ray.tune
from functools import partial
logger = logging.getLogger(__name__)
from optimizer.trainable import trainable
#load the system configuration file
@hydra.main(version_base=None, config_path="../configs", config_name="main_train.yaml")
def main(hydra_cfg : omegaconf.DictConfig) -> None:
logger.info(f"**************Hydra Configs**************\n{omegaconf.OmegaConf.to_yaml(hydra_cfg)}")
#get the output directory of the current run
hydra_output_dir = hydra.core.hydra_config.HydraConfig.get()
hydra_output_dir=hydra_output_dir['runtime']['output_dir']
#build the current hyperparameter search space
tune_cfg={}
for key in hydra_cfg.ray_tune.config.keys():
tune_cfg[key]=hydra.utils.instantiate(hydra_cfg.ray_tune.config[key],_recursive_=False)
#trial scheduler
scheduler=hydra.utils.instantiate(hydra_cfg.ray_tune.scheduler)
result=ray.tune.run(
partial(trainable,hydra_cfg),
resources_per_trial=omegaconf.OmegaConf.to_container(hydra_cfg.ray_tune.run.resources_per_trial),
config=tune_cfg,
num_samples=hydra_cfg.ray_tune.run.num_samples,
scheduler=scheduler,
storage_path=hydra_output_dir,
log_to_file=os.path.join(hydra_output_dir,"ray-tune.log"),
name="ray-tune",
verbose=1
)
#select and report the best trial
best_trial = result.get_best_trial("val_loss", "min", "last")
print(f"Best trial config: {best_trial.config}")
print(f"Best trial final validation loss: {best_trial.last_result['val_loss']}")
if __name__=="__main__":
main()

78
src/model/CNN1D_FCNN.py Normal file
View File

@ -0,0 +1,78 @@
import torch.nn as nn
import torch.nn.functional as F
import torchinfo
import torch
class CNN1D_FCNN(nn.Module):
def __init__(self,):
super(CNN1D_FCNN, self).__init__()
self.Conv2d_1=nn.Conv1d(in_channels=1, out_channels=1024, kernel_size=3,stride=1)
self.Conv2d_2=nn.Conv1d(in_channels=1024, out_channels=2048, kernel_size=3,stride=1)
self.MaxPool1d_1=nn.MaxPool1d(kernel_size=2, stride=2)
self.Conv2d_3=nn.Conv1d(in_channels=2048, out_channels=1024, kernel_size=3,stride=1)
self.Conv2d_4=nn.Conv1d(in_channels=1024, out_channels=256, kernel_size=3,stride=1)
self.Conv2d_5=nn.Conv1d(in_channels=256, out_channels=3, kernel_size=2,stride=1)
self.flatten=nn.Flatten(start_dim=1, end_dim=2)
# self.fc_1=nn.Linear(in_features=645,out_features=1024) #2048->1024
# self.fc_2=nn.Linear(in_features=1024,out_features=512)
self.fc_3=nn.Linear(in_features=315,out_features=256)
self.fc_4=nn.Linear(in_features=256,out_features=128)
self.fc_5=nn.Linear(in_features=128,out_features=4)
# self.reshape_3=nn.Unflatten(dim=0,unflattened_size=(batch_size,601))
def forward(self, x):
# input_shape=x.shape
# x=torch.mean(x,dim=1,keepdim=True)
# x=torch.mean(x,dim=3)
# print(x.shape)
x=torch.unsqueeze(x, 1)
x=F.tanh(self.Conv2d_1(x))
x=F.tanh(self.Conv2d_2(x))
x=self.MaxPool1d_1(x)
x=F.tanh(self.Conv2d_3(x))
x=F.tanh(self.Conv2d_4(x))
x=F.tanh(self.Conv2d_5(x))
x=self.flatten(x)
# x=F.tanh(self.fc_1(x))
# x=F.tanh(self.fc_2(x))
x=F.tanh(self.fc_3(x))
x=F.tanh(self.fc_4(x))
x=F.sigmoid(self.fc_5(x))
#
# x=nn.Unflatten(dim=0,unflattened_size=(int(input_shape[0]),int(input_shape[1])))(x)
# x=nn.Flatten(start_dim=2, end_dim=4)(x)
# x,_=self.lstm_1(x)
# x=x[:,-1,:]
# x=nn.LeakyReLU()(self.fc_1(x))
# x=nn.LeakyReLU()(self.fc_2(x))
# x=nn.Sigmoid()(self.fc_3(x))
return x
if __name__=="__main__":
model=CNN1D_FCNN()
torchinfo.summary(model,input_size=(64, 224))

76
src/model/CNN_FCNN.py Normal file
View File

@ -0,0 +1,76 @@
import torch.nn as nn
import torch.nn.functional as F
import torchinfo
import torch
class CNN_FCNN(nn.Module):
def __init__(self,):
super(CNN_FCNN, self).__init__()
self.Conv2d_1=nn.Conv2d(in_channels=1, out_channels=512, kernel_size=3,stride=1)
self.Conv2d_2=nn.Conv2d(in_channels=512, out_channels=1024, kernel_size=3,stride=1)
# self.MaxPool2d_1=nn.MaxPool2d(kernel_size=2, stride=2)
self.Conv2d_3=nn.Conv2d(in_channels=1024, out_channels=512, kernel_size=3,stride=1)
self.Conv2d_4=nn.Conv2d(in_channels=512, out_channels=64, kernel_size=3,stride=1)
self.Conv2d_5=nn.Conv2d(in_channels=64, out_channels=6, kernel_size=2,stride=1)
self.flatten=nn.Flatten(start_dim=1, end_dim=3)
self.fc_1=nn.Linear(in_features=1290,out_features=2048) #2048->1024
self.fc_2=nn.Linear(in_features=2048,out_features=1024)
self.fc_3=nn.Linear(in_features=1024,out_features=512)
self.fc_4=nn.Linear(in_features=512,out_features=256)
self.fc_5=nn.Linear(in_features=256,out_features=4)
# self.reshape_3=nn.Unflatten(dim=0,unflattened_size=(batch_size,601))
def forward(self, x):
# input_shape=x.shape
x=torch.mean(x,dim=1,keepdim=True)
# x=torch.unsqueeze(x, 1)
x=F.tanh(self.Conv2d_1(x))
x=F.tanh(self.Conv2d_2(x))
# x=self.MaxPool2d_1(x)
x=F.tanh(self.Conv2d_3(x))
x=F.tanh(self.Conv2d_4(x))
x=F.tanh(self.Conv2d_5(x))
x=self.flatten(x)
x=F.tanh(self.fc_1(x))
x=F.tanh(self.fc_2(x))
x=F.tanh(self.fc_3(x))
x=F.tanh(self.fc_4(x))
x=F.sigmoid(self.fc_5(x))
#
# x=nn.Unflatten(dim=0,unflattened_size=(int(input_shape[0]),int(input_shape[1])))(x)
# x=nn.Flatten(start_dim=2, end_dim=4)(x)
# x,_=self.lstm_1(x)
# x=x[:,-1,:]
# x=nn.LeakyReLU()(self.fc_1(x))
# x=nn.LeakyReLU()(self.fc_2(x))
# x=nn.Sigmoid()(self.fc_3(x))
return x
if __name__=="__main__":
model=CNN_FCNN()
torchinfo.summary(model,input_size=(64, 48, 224, 10))

View File

@ -0,0 +1,69 @@
import torch.nn as nn
import torch.nn.functional as F
import torchinfo
import torch
class CNN_LSTM_FCNN(nn.Module):
def __init__(self,):
super(CNN_LSTM_FCNN, self).__init__()
# self.reshape_1=nn.Flatten(start_dim=0, end_dim=1)
self.Conv2d_1=nn.Conv2d(in_channels=1, out_channels=256, kernel_size=3,stride=1)
self.Conv2d_2=nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3,stride=1)
self.Conv2d_3=nn.Conv2d(in_channels=512, out_channels=256, kernel_size=3,stride=1)
self.Conv2d_4=nn.Conv2d(in_channels=256, out_channels=32, kernel_size=3,stride=1)
self.MaxPool2d_1=nn.MaxPool2d(kernel_size=2, stride=2)
self.lstm_1=nn.LSTM(input_size=3456, hidden_size=2048,num_layers=2, batch_first=True)
self.fc_1=nn.Linear(in_features=2048,out_features=1024) #2048->1024
self.fc_2=nn.Linear(in_features=1024,out_features=256)
self.fc_3=nn.Linear(in_features=256,out_features=4)
# self.reshape_3=nn.Unflatten(dim=0,unflattened_size=(batch_size,601))
def forward(self, x):
input_shape=x.shape
x=nn.Flatten(start_dim=0, end_dim=1)(x)
x=torch.unsqueeze(x, 1)
x=self.Conv2d_1(x)
x=self.Conv2d_2(x)
x=self.Conv2d_3(x)
x=self.Conv2d_4(x)
x=self.MaxPool2d_1(x)
x=nn.Unflatten(dim=0,unflattened_size=(int(input_shape[0]),int(input_shape[1])))(x)
x=nn.Flatten(start_dim=2, end_dim=4)(x)
x,_=self.lstm_1(x)
x=x[:,-1,:]
x=nn.LeakyReLU()(self.fc_1(x))
x=nn.LeakyReLU()(self.fc_2(x))
x=nn.Sigmoid()(self.fc_3(x))
return x
if __name__=="__main__":
model=CNN_LSTM_FCNN()
torchinfo.summary(model,input_size=(64, 48, 224, 10))

42
src/model/FCNN.py Normal file
View File

@ -0,0 +1,42 @@
import torch.nn as nn
import torch.nn.functional as F
import torchinfo
class FCNN(nn.Module):
def __init__(self,):
super(FCNN, self).__init__()
self.fc1 = nn.Linear(48* 224* 10, 4)
# self.fc2 = nn.Linear(40960, 10240)
# self.fc3 = nn.Linear(10240, 4096)
# self.fc4 = nn.Linear(4096, 2048)
# self.fc5 = nn.Linear(2048, 1024)
# self.fc6 = nn.Linear(1024, 512)
# self.fc7 = nn.Linear(512, 256)
# self.fc8 = nn.Linear(256, 128)
self.fc9 = nn.Linear(128, 4)
# self.fc4 = nn.Linear(10240, 4)
def forward(self, x):
x=x.reshape(x.shape[0],-1)
x = F.relu(self.fc1(x))
# x = F.relu(self.fc2(x))
# x = F.relu(self.fc3(x))
# x = F.relu(self.fc4(x))
# x = F.relu(self.fc5(x))
# x = F.relu(self.fc6(x))
# x = F.relu(self.fc7(x))
# x = F.relu(self.fc8(x))
# x = F.sigmoid(self.fc9(x))
# x = F.relu(self.fc2(x))
# x = F.relu(self.fc3(x))
# x = F.relu(self.fc4(x))
return x
if __name__=="__main__":
model=FCNN()
torchinfo.summary(model,input_size=(64, 48, 224, 10))

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

246
src/optimizer/trainable.py Normal file
View File

@ -0,0 +1,246 @@
import ray.train
import ray.tune
import ray.tune.logger
from data.pre_process import DataPreProcess
from data.spectral_dataset import load_dataset
import ray
import hydra
import random
import logging
import torch
import pickle
import pathlib
import tempfile
import time,os
from torch.utils.tensorboard import SummaryWriter
import numpy as np
import matplotlib.pyplot as plt
logger = logging.getLogger("ray")
def trainable(hydra_cfg,tune_cfg):
"""
tune_cfg: the tunable configuration passed in by Ray Tune
hydra_cfg: the full configuration passed in by Hydra
"""
#set up the TensorBoard writer
writer=SummaryWriter(log_dir=os.getcwd().replace("working_dirs","driver_artifacts"))
############################
## Preprocess the raw data into a dataset ##
############################
t_1=time.time()
data_preprocess=DataPreProcess( hydra_cfg.raw_spectral_data_dir,hydra_cfg.raw_labels_dir,hydra_cfg.labels_name,hydra_cfg.dataset_dir,hydra.utils.instantiate(tune_cfg["choose_frame_spatial"]), hydra.utils.instantiate(tune_cfg["features_scaling"]),hydra.utils.instantiate(tune_cfg["labels_scaling"]),hydra_cfg.dataset.train_ratio,hydra_cfg.dataset.validate_ratio)
t_2=time.time()
logger.info(f"Preprocessed raw data costs {t_2-t_1}s")
############################
## Load the dataset into dataloaders ##
############################
train_dataset,val_dataset=load_dataset(data_preprocess.dataset_save_dir)
trainloader = torch.utils.data.DataLoader(
train_dataset, batch_size=int(tune_cfg["batch_size"]), shuffle=True, num_workers=hydra_cfg.dataset.num_worker
)
valloader = torch.utils.data.DataLoader(
val_dataset, batch_size=int(tune_cfg["batch_size"]), shuffle=True, num_workers=hydra_cfg.dataset.num_worker
)
t_3=time.time()
logger.info(f"Dataloader costs {t_3-t_2}s")
# select the training device
device = torch.device("cpu")
if torch.cuda.is_available():
device = torch.device("cuda:0")
################################
## Initialize model, optimizer and loss function ##
################################
model=hydra.utils.instantiate(tune_cfg["model"])
model.to(device)
criterion=hydra.utils.instantiate(tune_cfg["criterion"])
optimizer=hydra.utils.instantiate(tune_cfg["optimizer"])
optimizer=optimizer(model.parameters())
t_4=time.time()
logger.info(f"Initilized model costs {t_4-t_3}s")
#BUG,暂不可用
################################
## Load a checkpoint from an unfinished run ##
################################
checkpoint = ray.train.get_checkpoint()
if checkpoint:
with checkpoint.as_directory() as checkpoint_dir:
data_path = pathlib.Path() / "data.pkl"
with open(data_path, "rb") as fp:
checkpoint_state = pickle.load(fp)
start_epoch = checkpoint_state["epoch"]
model.load_state_dict(checkpoint_state["net_state_dict"])
optimizer.load_state_dict(checkpoint_state["optimizer_state_dict"])
else:
start_epoch = 0
t_5=time.time()
logger.info(f"Check checkpoint costs {t_5-t_4}s")
################################
## Write the model graph to TensorBoard ##
################################
sample = train_dataset[0:4][0]
print(sample.shape)
writer.add_graph(model, torch.tensor(sample).to(device))
for epoch in range(start_epoch, hydra_cfg.train.max_epoch):
################################
## Model training ##
################################
logger.info(f"Start epoch {epoch+1}")
train_loss = 0.0
train_steps = 0
train_hit_rate=None
t_epoch_start=time.time()
train_errors=None
train_outputs=None
train_labels=None
model.train()
for batch, (features, labels) in enumerate(trainloader):
t_iteration_start=time.time()
features=features.to(device)
labels=labels.to(device)
# zero the parameter gradients
optimizer.zero_grad()
# Compute prediction and loss
outputs = model(features)
loss = criterion(outputs, labels)
labels_npy=labels.cpu().detach().numpy()
outputs_npy=outputs.cpu().detach().numpy()
# compute the custom metrics
error,hit_rate=data_preprocess.get_metric(outputs_npy, labels_npy)
# Backpropagation
loss.backward()
optimizer.step()
# accumulate our custom metrics
train_loss += loss.item()
train_steps += 1
if train_steps==1:
train_hit_rate=hit_rate
train_errors=error
train_outputs=outputs_npy
train_labels=labels_npy
else:
train_hit_rate=(train_steps-1)/train_steps *train_hit_rate+ 1/train_steps*hit_rate
train_errors=np.concatenate((train_errors,error),axis=0)
train_outputs=np.concatenate((train_outputs,outputs_npy),axis=0)
train_labels=np.concatenate((train_labels,labels_npy),axis=0)
t_iteration_end=time.time()
print("Training Epoch:[{}/{}],Iteration:{}/{},Loss:{}, Cost {}s".format(epoch+1,hydra_cfg.train.max_epoch,train_steps,len(trainloader),loss.item(),t_iteration_end-t_iteration_start))
train_loss=train_loss/train_steps
################################
## Model validation ##
################################
val_loss = 0.0
val_steps = 0
val_hit_rate=None
t_epoch_train_end=time.time()
logger.info(f"Training costs {t_epoch_train_end-t_epoch_start}s")
model.eval()
for batch, (features, labels) in enumerate(valloader):
t_iteration_start=time.time()
with torch.no_grad():
features=features.to(device)
labels=labels.to(device)
outputs = model(features)
loss = criterion(outputs, labels)
error,hit_rate=data_preprocess.get_metric(outputs.cpu().detach().numpy(), labels.cpu().detach().numpy())
val_loss += loss.cpu().numpy()
val_steps += 1
if val_steps==1:
val_hit_rate=hit_rate
else:
val_hit_rate=(val_steps-1)/val_steps *val_hit_rate+ 1/val_steps*hit_rate
t_iteration_end=time.time()
print("Validate Iteration:{}/{},Loss:{}, Cost {}s".format(val_steps,len(valloader),loss.item(),t_iteration_end-t_iteration_start))
t_epoch_val_end=time.time()
logger.info(f"Validate costs {t_epoch_val_end-t_epoch_train_end}s")
#####################################################
## Save a checkpoint and log to TensorBoard every epoch ##
#####################################################
checkpoint_data = {
"epoch": epoch,
"net_state_dict": model.state_dict(),
"optimizer_state_dict": optimizer.state_dict(),
}
with tempfile.TemporaryDirectory() as checkpoint_dir:
data_path = pathlib.Path(checkpoint_dir) / "data.pkl"
with open(data_path, "wb") as fp:
pickle.dump(checkpoint_data, fp)
checkpoint = ray.train.Checkpoint.from_directory(checkpoint_dir)
reports={"val_loss": val_loss / val_steps,
"train_loss": train_loss,
}
for i,name in enumerate(hydra_cfg.labels_name):
reports[f"train_hit_rate_{name}"]=train_hit_rate[i]
reports[f"val_hit_rate_{name}"]=val_hit_rate[i]
writer.add_histogram(tag=f'train_error_{name}', values=train_errors[:,i], global_step=epoch)
fig, ax = plt.subplots()
ax.scatter(np.arange(train_outputs.shape[0]),train_outputs[:,i])
ax.scatter(np.arange(train_outputs.shape[0]),train_labels[:,i])
writer.add_figure(f'train_imgage_{name}', fig , epoch)
plt.close(fig)
ray.train.report(
reports,
checkpoint=checkpoint,
)
t_epoch_end=time.time()
logger.info(f"Save Checkpoint costs {t_epoch_end-t_epoch_val_end}s")
print("Training Epoch:[{}/{}], Average Loss:{}, Cost {}s".format(epoch+1,hydra_cfg.train.max_epoch,train_loss,t_epoch_end-t_epoch_start))
for i,name in enumerate(hydra_cfg.labels_name):
print(f"train_hit_rate_{name}: {train_hit_rate[i]} ",end=" ")
print(f"val_hit_rate_{name}: {val_hit_rate[i]} ",end=" ")

View File

@ -0,0 +1,71 @@
'''
@File : Naner_label_excel_standardization.py
@Time : 2024/08/09 10:25:21
@Author : Zhanpeng Yang
@Version : 0.0.1
@Contact : zhpyang@outlook.com
@Desc : Convert the label files exported from the steel plant's system into a standard format for later use
'''
import pandas as pd
import os
if __name__ =="__main__":
raw_label_file_path="/home/admin/20240806-NanEr-5-8-data/labels/raw_labels/NanEr/5-8月份6#炉生产数据.xls"
save_dir="/home/admin/20240806-NanEr-5-8-data/labels/NanEr"
select_and_rename={
"炉号":"Furnace_Number",
"钢种": "Steel_Type",
"SL1开始时间":"TSC_start_time",
"SL1结束时间":"TSC_end_time",
"TSC温度":"TSC_T",
"TSC测量碳值":"TSC_C",
"TSC化验碳值":"TSC_C_lab",
"TSC化验磷值":"TSC_P_lab",
"TSC化验锰值":"TSC_Mn_lab",
"TSC化验铬值":"TSC_Cr_lab",
"SL2开始时间":"TSO_start_time",
"SL2结束时间":"TSO_end_time",
"TSO温度":"TSO_T",
"TSO测量碳值":"TSO_C",
"TSO化验碳值":"TSO_C_lab",
"TSO化验磷值":"TSO_P_lab",
"TSO化验锰值":"TSO_Mn_lab",
"TSO化验铬值":"TSO_Cr_lab",
}
##read the file
raw_labels=pd.read_excel(raw_label_file_path)
#keep only the columns we need; selecting from row 1 onwards because row 0 still holds column names
selected_labels=raw_labels.loc[1:,select_and_rename.keys()]
#rename the columns
selected_labels=selected_labels.rename(columns=select_and_rename)
#The logic below has been moved into the data preprocessing step: dropping every row that contains any 0 is too strict; only rows where the specified labels are 0 should be dropped
# # rows containing NULL
# null_rows=selected_labels.isnull().any(axis=1)
# # rows containing 0
# zeros_rows=(selected_labels==0).any(axis=1) | (selected_labels=='0').any(axis=1)
# # drop every row that has a NULL or a 0
# selected_rows=~(null_rows|zeros_rows)
# selected_labels=selected_labels[selected_rows]
if not os.path.exists(save_dir):
os.makedirs(save_dir)
save_path=os.path.join(save_dir,f"{selected_labels.iloc[0]['TSC_start_time'].strftime('%Y-%m-%d')}_{selected_labels.iloc[-1]['TSO_end_time'].strftime('%m-%d')}_{selected_labels.shape[0]}.xlsx")
selected_labels.to_excel(save_path,index=False)
print(f"处理完毕的文件保存于:{save_path}")

View File

@ -0,0 +1,170 @@
import torch
import numpy as np
import h5py
import time
data=np.random.random((60000,448,1024)).astype(np.float32)
# #####numpy: save, load and iterate
# t_1=time.time()
# np.save("./temp/data.npy",data)
# t_2=time.time()
# save_time=t_2-t_1
# print(f"Numpy save time:{save_time}s")
# data=np.load("./temp/data.npy")
# t_3=time.time()
# load_time=t_3-t_2
# print(f"Numpy load time:{load_time}s")
# for i in range(data.shape[0]):
# tmp=data[i]
# # print(tmp)
# t_4=time.time()
# ergodic_time=t_4-t_3
# print(f"Numpy ergodic time:{ergodic_time}s")
# #####torch: save, load and iterate
# data_torch=torch.tensor(data)
# t_1=time.time()
# torch.save(data_torch,"./temp/data.pt")
# t_2=time.time()
# save_time=t_2-t_1
# print(f"Torch save time:{save_time}s")
# data_torch=torch.load("./temp/data.pt",weights_only=True)
# t_3=time.time()
# load_time=t_3-t_2
# print(f"Torch load time:{load_time}s")
# for i in range(data.shape[0]):
# tmp=data[i]
# t_4=time.time()
# ergodic_time=t_4-t_3
# print(f"Torch ergodic time:{ergodic_time}s")
#####h5: save, load and iterate
t_1=time.time()
with h5py.File("./temp/data.h5", 'w') as f:
h5_features = f.create_dataset(
'data',
data.shape,
dtype=np.float32,
)
h5_features[...]=data[...]
t_2=time.time()
save_time=t_2-t_1
print(f"HDF5 save time:{save_time}s")
hdf5_f = h5py.File("./temp/data.h5", 'r')
# load_data= hdf5_f["data"][...]
t_3=time.time()
load_time=t_3-t_2
print(f"HDF5 load time:{load_time}s")
for i in range(hdf5_f["data"].shape[0]):
tmp=hdf5_f["data"][i]
t_4=time.time()
ergodic_time=t_4-t_3
print(f"HDF5 ergodic time:{ergodic_time}s")
for i in range(hdf5_f["data"].shape[0]):
tmp=hdf5_f["data"][i]
t_5=time.time()
ergodic_time=t_5-t_4
print(f"HDF5 twice ergodic time:{ergodic_time}s")
hdf5_f.close()
#####h5 [()]: save, load and iterate
t_1=time.time()
t_2=time.time()
save_time=t_2-t_1
print(f"HDF5 [()] save time:{save_time}s")
hdf5_f = h5py.File("./temp/data_tmp.h5", 'r')
data=hdf5_f["data"][()]
# load_data= hdf5_f["data"][...]
t_3=time.time()
load_time=t_3-t_2
print(f"HDF5 [()] load time:{load_time}s")
for i in range(data.shape[0]):
tmp=data[i]
t_4=time.time()
ergodic_time=t_4-t_3
print(f"HDF5 [()] ergodic time:{ergodic_time}s")
for i in range(data.shape[0]):
tmp=data[i]
t_5=time.time()
ergodic_time=t_5-t_4
print(f"HDF5 [()] twice ergodic time:{ergodic_time}s")
# for i in range(hdf5_f["data"].shape[0]):
# tmp=load_data[i]
# t_6=time.time()
# ergodic_time=t_6-t_5
# print(f"HDF5 loaded ergodic time:{ergodic_time}s")
hdf5_f.close()
# #####h5 optimized: save, load and iterate
# t_1=time.time()
# with h5py.File("./temp/data_optimzed.h5", 'w') as f:
# h5_features = f.create_dataset(
# 'data',
# data.shape,
# maxshape=(None,448,1024),
# chunks=(1,448,1024), # manually set the chunk size to one sample
# # compression='gzip',
# # compression_opts=1,
# dtype=np.float32,
# )
# h5_features[...]=data[...]
# t_2=time.time()
# save_time=t_2-t_1
# print(f"HDF5 Optimized save time:{save_time}s")
# hdf5_f = h5py.File("./temp/data_optimzed.h5", 'r',
# rdcc_nbytes=1024**2*1024*20, # total chunk-cache size in bytes (20 GB here); should be an integer multiple of the chunk size
# rdcc_w0=0, # cache eviction policy: closer to 0 evicts least-recently-used chunks first, closer to 1 evicts fully read/written chunks first
# # rdcc_nslots=1e8, # number of slots for locating cached chunks; recommended 10x the number of cached chunks, 100x for best performance. Default 521
# )
# data=hdf5_f["data"]
# loaddata=data[...]
# t_3=time.time()
# load_time=t_3-t_2
# print(f"HDF5 Optimized load time:{load_time}s")
# for i in range(data.shape[0]):
# tmp=data[i]
# t_4=time.time()
# ergodic_time=t_4-t_3
# print(f"HDF5 Optimized ergodic time:{ergodic_time}s")
# for i in range(data.shape[0]):
# tmp=data[i]
# # print(tmp)
# t_5=time.time()
# ergodic_time=t_5-t_4
# print(f"HDF5 Optimized twice ergodic time:{ergodic_time}s")
# hdf5_f.close