@@ -1725,7 +1725,12 @@ def _save_checkpoint(self, model, trial, metrics=None):
1725 1725             torch.save(self.scaler.state_dict(), os.path.join(output_dir, SCALER_NAME))
1726 1726
1727 1727         # Determine the new best metric / best model checkpoint
1728      -       if metrics is not None and self.args.metric_for_best_model is not None:
     1728 +       if (
     1729 +           metrics is not None
     1730 +           and self.args.metric_for_best_model is not None
     1731 +           and self.args.best_model_after_epoch is not None
     1732 +           and self.state.epoch > self.args.best_model_after_epoch
     1733 +       ):
1729 1734             metric_to_check = self.args.metric_for_best_model
1730 1735             if not metric_to_check.startswith("eval_"):
1731 1736                 metric_to_check = f"eval_{metric_to_check}"
@@ -2661,7 +2666,9 @@ def prediction_step(
2661 2666             logits = smp_nested_concat(logits_mb)
2662 2667         else:
2663 2668             if has_labels:
2664      -               with self.autocast_smart_context_manager(enabled=hasattr(self, "scaler") and self.scaler.is_enabled()):
     2669 +               with self.autocast_smart_context_manager(
     2670 +                   enabled=hasattr(self, "scaler") and self.scaler.is_enabled()
     2671 +               ):
2665 2672                     loss, outputs = self.compute_loss(model, inputs, return_outputs=True)
2666 2673                     loss = loss.mean().detach()
@@ -2671,7 +2678,9 @@ def prediction_step(
2671 2678                 logits = outputs[1:]
2672 2679             else:
2673 2680                 loss = None
2674      -               with self.autocast_smart_context_manager(enabled=hasattr(self, "scaler") and self.scaler.is_enabled()):
     2681 +               with self.autocast_smart_context_manager(
     2682 +                   enabled=hasattr(self, "scaler") and self.scaler.is_enabled()
     2683 +               ):
2675 2684                     outputs = model(**inputs)
2676 2685                 if isinstance(outputs, dict):
2677 2686                     logits = tuple(v for k, v in outputs.items() if k not in ignore_keys)
0 commit comments