pytorch

package
v0.5.0 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Dec 5, 2022 License: Apache-2.0 Imports: 43 Imported by: 0

Documentation

Index

Constants

View Source
const (
	AnnotationCheckpointRequestedVersion = v1.KubeDLPrefix + "/ckpt-requested-version"
	AnnotationCheckpointCompletedVersion = v1.KubeDLPrefix + "/ckpt-completed-version"
	AnnotationReadyToStartWorker         = v1.KubeDLPrefix + "/ready-to-start-worker"
	AnnotationImmediatelyStartWorker     = v1.KubeDLPrefix + "/immediately-start-worker"
	AnnotationWorldSize                  = v1.KubeDLPrefix + "/world-size"
)
View Source
const (
	CheckpointStartReason    = "CheckpointStarted"
	CheckpointFinishedReason = "CheckpointSucceeded"
	CheckpointFailedReason   = "CheckpointFailed"
)

Variables

This section is empty.

Functions

func AddImageWarmupForWorker added in v0.4.3

func AddImageWarmupForWorker(podTemplate *corev1.PodTemplateSpec, mainContainerName string)

func AddMasterWaiterForWorker added in v0.4.3

func AddMasterWaiterForWorker(podTemplate *corev1.PodTemplateSpec, param InitContainerParam) error

func ContainMasterSpec

func ContainMasterSpec(job *training.PyTorchJob) bool

Types

type InitContainerParam added in v0.4.3

type InitContainerParam struct {
	MasterAddr         string
	InitContainerImage string
}

type PytorchJobReconciler

type PytorchJobReconciler struct {
	client.Client
	// contains filtered or unexported fields
}

PytorchJobReconciler reconciles a PytorchJob object.

func (*PytorchJobReconciler) CheckpointIfNecessary added in v0.4.3

func (r *PytorchJobReconciler) CheckpointIfNecessary(job interface{}, pods []*corev1.Pod) (completed bool, err error)

CheckpointIfNecessary triggers a checkpoint when workers are about to be preempted or evicted: it notifies AIMaster to checkpoint and drains the victim pods after the checkpoint succeeds. Each checkpoint request contains a `version` to distinguish between different rounds of progress, and the controller guarantees that 'checkpoint-version' <= 'job generation'. When preemption happens, the controller triggers a new round of checkpointing, takes the job generation as its version, and increments the generation after the checkpoint succeeds.

func (*PytorchJobReconciler) ControllerName

func (r *PytorchJobReconciler) ControllerName() string

func (*PytorchJobReconciler) DeleteJob

func (r *PytorchJobReconciler) DeleteJob(job interface{}) error

DeleteJob deletes the job

func (*PytorchJobReconciler) EnableElasticScaling added in v0.4.3

func (r *PytorchJobReconciler) EnableElasticScaling(job metav1.Object, runPolicy *v1.RunPolicy) bool

func (*PytorchJobReconciler) GetAPIGroupVersion

func (r *PytorchJobReconciler) GetAPIGroupVersion() schema.GroupVersion

GetAPIGroupVersion returns the GroupVersion of the API

func (*PytorchJobReconciler) GetAPIGroupVersionKind

func (r *PytorchJobReconciler) GetAPIGroupVersionKind() schema.GroupVersionKind

GetAPIGroupVersionKind returns the GroupVersionKind of the API

func (*PytorchJobReconciler) GetDefaultContainerName

func (r *PytorchJobReconciler) GetDefaultContainerName() string

GetDefaultContainerName returns the default container name in pod

func (*PytorchJobReconciler) GetDefaultContainerPortName

func (r *PytorchJobReconciler) GetDefaultContainerPortName() string

GetDefaultContainerPortName returns the default container port name.

func (*PytorchJobReconciler) GetDefaultContainerPortNumber

func (r *PytorchJobReconciler) GetDefaultContainerPortNumber() int32

GetDefaultContainerPortNumber returns the default container port number.

func (*PytorchJobReconciler) GetGroupNameLabelValue

func (r *PytorchJobReconciler) GetGroupNameLabelValue() string

GetGroupNameLabelValue returns the group name (value) in the labels of the job.

func (*PytorchJobReconciler) GetJobFromAPIClient

func (r *PytorchJobReconciler) GetJobFromAPIClient(namespace, name string) (metav1.Object, error)

GetJobFromAPIClient returns the Job from the API server.

func (*PytorchJobReconciler) GetJobFromInformerCache

func (r *PytorchJobReconciler) GetJobFromInformerCache(namespace, name string) (metav1.Object, error)

GetJobFromInformerCache returns the Job from Informer Cache

func (*PytorchJobReconciler) GetNodeForModelOutput added in v0.4.0

func (r *PytorchJobReconciler) GetNodeForModelOutput(pods []*corev1.Pod) (nodeName string)

func (*PytorchJobReconciler) GetPodsForJob

func (r *PytorchJobReconciler) GetPodsForJob(obj interface{}) ([]*corev1.Pod, error)

GetPodsForJob returns the pods managed by the job. This can be achieved by selecting pods using the label key "job-name", i.e. all pods created by the job will come with the label "job-name" = <this_job_name>.

func (*PytorchJobReconciler) GetReconcileOrders

func (r *PytorchJobReconciler) GetReconcileOrders() []v1.ReplicaType

func (*PytorchJobReconciler) GetServicesForJob

func (r *PytorchJobReconciler) GetServicesForJob(obj interface{}) ([]*corev1.Service, error)

GetServicesForJob returns the services managed by the job. This can be achieved by selecting services using the label key "job-name", i.e. all services created by the job will come with the label "job-name" = <this_job_name>.

func (*PytorchJobReconciler) IsMasterRole

func (r *PytorchJobReconciler) IsMasterRole(replicas map[v1.ReplicaType]*v1.ReplicaSpec, rtype v1.ReplicaType, index int) bool

IsMasterRole reports whether the replica with the specified type and index is a master role. A master-role pod will have "job-role=master" set in its labels.

func (*PytorchJobReconciler) Reconcile

func (r *PytorchJobReconciler) Reconcile(_ context.Context, req ctrl.Request) (ctrl.Result, error)

func (*PytorchJobReconciler) ScaleIn added in v0.4.3

func (r *PytorchJobReconciler) ScaleIn(job interface{}, replicas map[v1.ReplicaType]*v1.ReplicaSpec, activePods []*corev1.Pod, activeServices []*corev1.Service) error

func (*PytorchJobReconciler) ScaleOut added in v0.4.3

func (r *PytorchJobReconciler) ScaleOut(job interface{}, replicas map[v1.ReplicaType]*v1.ReplicaSpec, activePods []*corev1.Pod, activeServices []*corev1.Service) error

func (*PytorchJobReconciler) SetClusterSpec

func (r *PytorchJobReconciler) SetClusterSpec(ctx context.Context, job interface{}, podTemplate *corev1.PodTemplateSpec, rtype, index string) error

SetClusterSpec sets the cluster spec for the pod

func (*PytorchJobReconciler) SetupWithManager

func (r *PytorchJobReconciler) SetupWithManager(mgr ctrl.Manager) error

func (*PytorchJobReconciler) UpdateJobStatus

func (r *PytorchJobReconciler) UpdateJobStatus(job interface{}, replicas map[v1.ReplicaType]*v1.ReplicaSpec, jobStatus *v1.JobStatus, restart bool) error

UpdateJobStatus updates the job status and job conditions

func (*PytorchJobReconciler) UpdateJobStatusInApiServer

func (r *PytorchJobReconciler) UpdateJobStatusInApiServer(job interface{}, jobStatus *v1.JobStatus) error

UpdateJobStatusInApiServer updates the job status in the API server.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL