@@ -385,33 +385,44 @@ func createGpuTrainJob(modelConvert *models.AiModelConvert, ctx *context.Context
dataActualPath = setting.Attachment.Minio.RealPath + setting.Attachment.Minio.Bucket + "/" + setting.CBCodePathPrefix + modelConvert.ID + "/dataset"
}
log.Info("dataActualPath=" + dataActualPath)
bootfile := ""
runParms := make(map[string]interface{}, 0)
if modelConvert.SrcEngine == PYTORCH_ENGINE {
if modelConvert.DestFormat == CONVERT_FORMAT_ONNX {
command = getGpuModelConvertCommand(modelConvert.ID, modelConvert.ModelPath, modelConvert, setting.ModelConvert.PytorchOnnxBootFile)
bootfile = setting.ModelConvert.PytorchOnnxBootFile
runParms = getGpuModelConvertRunParams(modelConvert.ID, modelConvert.ModelPath, modelConvert, setting.ModelConvert.PytorchOnnxBootFile)
//command = getGpuModelConvertCommand(modelConvert.ID, modelConvert.ModelPath, modelConvert, setting.ModelConvert.PytorchOnnxBootFile)
} else if modelConvert.DestFormat == CONVERT_FORMAT_TRT {
command = getGpuModelConvertCommand(modelConvert.ID, modelConvert.ModelPath, modelConvert, setting.ModelConvert.PytorchTrTBootFile)
bootfile = setting.ModelConvert.PytorchTrTBootFile
runParms = getGpuModelConvertRunParams(modelConvert.ID, modelConvert.ModelPath, modelConvert, setting.ModelConvert.PytorchTrTBootFile)
//command = getGpuModelConvertCommand(modelConvert.ID, modelConvert.ModelPath, modelConvert, setting.ModelConvert.PytorchTrTBootFile)
} else {
return errors.New("Not support the format.")
}
} else if modelConvert.SrcEngine == TENSORFLOW_ENGINE {
IMAGE_URL = setting.ModelConvert.GPU_TENSORFLOW_IMAGE
if modelConvert.DestFormat == CONVERT_FORMAT_ONNX {
command = getGpuModelConvertCommand(modelConvert.ID, modelConvert.ModelPath, modelConvert, setting.ModelConvert.TensorFlowGpuBootFile)
bootfile = setting.ModelConvert.TensorFlowGpuBootFile
runParms = getGpuModelConvertRunParams(modelConvert.ID, modelConvert.ModelPath, modelConvert, setting.ModelConvert.TensorFlowGpuBootFile)
//command = getGpuModelConvertCommand(modelConvert.ID, modelConvert.ModelPath, modelConvert, setting.ModelConvert.TensorFlowGpuBootFile)
} else {
return errors.New("Not support the format.")
}
} else if modelConvert.SrcEngine == PADDLE_ENGINE {
IMAGE_URL = setting.ModelConvert.GPU_PADDLE_IMAGE
if modelConvert.DestFormat == CONVERT_FORMAT_ONNX {
command = getGpuModelConvertCommand(modelConvert.ID, modelConvert.ModelPath, modelConvert, setting.ModelConvert.PaddleOnnxBootFile)
bootfile = setting.ModelConvert.PaddleOnnxBootFile
runParms = getGpuModelConvertRunParams(modelConvert.ID, modelConvert.ModelPath, modelConvert, setting.ModelConvert.PaddleOnnxBootFile)
//command = getGpuModelConvertCommand(modelConvert.ID, modelConvert.ModelPath, modelConvert, setting.ModelConvert.PaddleOnnxBootFile)
} else {
return errors.New("Not support the format.")
}
} else if modelConvert.SrcEngine == MXNET_ENGINE {
IMAGE_URL = setting.ModelConvert.GPU_MXNET_IMAGE
if modelConvert.DestFormat == CONVERT_FORMAT_ONNX {
command = getGpuModelConvertCommand(modelConvert.ID, modelConvert.ModelPath, modelConvert, setting.ModelConvert.MXnetOnnxBootFile)
bootfile = setting.ModelConvert.MXnetOnnxBootFile
runParms = getGpuModelConvertRunParams(modelConvert.ID, modelConvert.ModelPath, modelConvert, setting.ModelConvert.MXnetOnnxBootFile)
//command = getGpuModelConvertCommand(modelConvert.ID, modelConvert.ModelPath, modelConvert, setting.ModelConvert.MXnetOnnxBootFile)
} else {
return errors.New("Not support the format.")
}
@@ -505,7 +516,7 @@ func createGpuTrainJob(modelConvert *models.AiModelConvert, ctx *context.Context
},
},
PreTrainModel: nil,
BootFile: "" ,
BootFile: bootfile ,
OutPut: []entity.ContainerData{{
ContainerPath: "/tmp/output",
ReadOnly: false,
@@ -534,7 +545,7 @@ func createGpuTrainJob(modelConvert *models.AiModelConvert, ctx *context.Context
reqJson, _ := json.Marshal(req)
log.Info("reqJson=" + string(reqJson))
jobResult, err := createGrampusTrainJob(req, command)
jobResult, err := createGrampusTrainJob(req, command, runParms )
if err != nil {
log.Error("CreateJob failed:", err.Error(), ctx.Data["MsgID"])
@@ -594,8 +605,8 @@ func getGrampusTrainTaskConfig() *entity.AITaskBaseConfig {
return config
}
func createGrampusTrainJob(req entity.CreateTrainTaskRequest, exeCommand string) (*models.CreateGrampusJobResponse, error) {
jobResult, err := grampus.CreateJob(convertTrainReq2Grampus(req, exeCommand))
func createGrampusTrainJob(req entity.CreateTrainTaskRequest, exeCommand string, runParam map[string]interface{} ) (*models.CreateGrampusJobResponse, error) {
jobResult, err := grampus.CreateJob(convertTrainReq2Grampus(req, exeCommand, runParam ))
if err != nil {
log.Error("CreateNoteBook failed: %v", err.Error())
return nil, err
@@ -603,19 +614,19 @@ func createGrampusTrainJob(req entity.CreateTrainTaskRequest, exeCommand string)
return jobResult, nil
}
func convertTrainReq2Grampus(req entity.CreateTrainTaskRequest, exeCommand string) models.CreateGrampusJobRequest {
command := generateGrampusTrainCommand(req, exeCommand)
func convertTrainReq2Grampus(req entity.CreateTrainTaskRequest, exeCommand string, runParam map[string]interface{} ) models.CreateGrampusJobRequest {
// command := generateGrampusTrainCommand(req, exeCommand)
command := ""
tasks := make([]models.GrampusTasks, len(req.Tasks))
for i := 0; i < len(req.Tasks); i++ {
t := req.Tasks[i]
tasks[i] = convertTrainTask2Grampus(t, command)
tasks[i] = convertTrainTask2Grampus(t, command, runParam )
}
return models.CreateGrampusJobRequest{Name: req.Name, Tasks: tasks}
}
func convertTrainTask2Grampus(t entity.TrainTask, command string) models.GrampusTasks {
func convertTrainTask2Grampus(t entity.TrainTask, command string, runParam map[string]interface{} ) models.GrampusTasks {
return models.GrampusTasks{
Name: t.Name,
ResourceSpecId: t.ResourceSpecId,
@@ -630,6 +641,7 @@ func convertTrainTask2Grampus(t entity.TrainTask, command string) models.Grampus
BootFile: t.BootFile,
OutPut: convertContainerArray2Grampus(t.OutPut),
WorkServerNumber: t.WorkServerNumber,
RunParams: runParam,
}
}
@@ -713,8 +725,7 @@ func buildUnzipCodeCommand(codeConfigPath, codeFilePath, computeSource string) *
Next(entity.NewCommand("cd", codeConfigPath)).
Next(entity.NewCommand("unzip", "-q", codeFilePath)).
Next(entity.NewCommand("echo", "'unzip code finished'")).
Next(entity.NewCommand("ls", "-l")).
Next(entity.NewCommand("ls", "-l", "mnist_pytorchexample_gpu"))
Next(entity.NewCommand("ls", "-l"))
return builder
}
func buildUnzipDatasetCommand(datasets []entity.ContainerData, datasetPath, computeSource string) *entity.CommandBuilder {
@@ -806,6 +817,33 @@ func getGpuModelConvertCommand(name string, modelFile string, modelConvert *mode
return command
}
// getGpuModelConvertRunParams builds the runtime-parameter map handed to the
// GPU model-convert job. InputShape is expected as a comma-separated NCHW
// string (e.g. "256,1,28,28"); when it does not contain exactly four fields,
// MNIST-style defaults (batch 256, 1 channel, 28x28) are used instead.
// For TensorRT targets with FP16 output, the "fp16" flag is set.
// name and bootfile are currently unused but kept for signature parity with
// getGpuModelConvertCommand — TODO confirm before removing.
func getGpuModelConvertRunParams(name string, modelFile string, modelConvert *models.AiModelConvert, bootfile string) map[string]interface{} {
	// Defaults when InputShape is absent or malformed.
	n, c, h, w := "256", "1", "28", "28"
	if parts := strings.Split(modelConvert.InputShape, ","); len(parts) == 4 {
		// Tolerate surrounding whitespace, e.g. "256, 1, 28, 28".
		n = strings.TrimSpace(parts[0])
		c = strings.TrimSpace(parts[1])
		h = strings.TrimSpace(parts[2])
		w = strings.TrimSpace(parts[3])
	}
	re := map[string]interface{}{
		"model": modelFile,
		"n":     n,
		"c":     c,
		"h":     h,
		"w":     w,
	}
	if modelConvert.DestFormat == CONVERT_FORMAT_TRT && modelConvert.NetOutputFormat == NetOutputFormat_FP16 {
		// Python-style "True": consumed by the conversion boot script, not Go.
		re["fp16"] = "True"
	}
	return re
}
func DeleteModelConvert(ctx *context.Context) {
log.Info("delete model convert start.")
id := ctx.Params(":id")