
Commit 9837910

changes to save and test the models

1 parent: a775004

5 files changed, +148 −129 lines

algoperf/workloads/criteo1tb/criteo1tb_jax/workload.py
1 addition, 1 deletion

@@ -105,7 +105,7 @@ def init_model_fn(
         {'params': params_rng, 'dropout': dropout_rng},
         jnp.ones(input_shape, jnp.float32))
     initial_params = initial_variables['params']
-    initial_params = use_pytorch_weights_inplace(initial_params, file_name="/results/pytorch_base_model_criteo1tb_24_june.pth")
+    initial_params = use_pytorch_weights_inplace(initial_params, file_name="/results/pytorch_base_model_criteo1tb_1_july.pth")
     self._param_shapes = param_utils.jax_param_shapes(initial_params)
     self._param_types = param_utils.jax_param_types(self._param_shapes)
     return jax_utils.replicate(initial_params), None
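The only functional change here is pointing `use_pytorch_weights_inplace` at the 1 July base-model snapshot instead of the 24 June one. For reference, a minimal sketch of the loading step such a helper performs, assuming it simply reads the PyTorch `state_dict` and converts each tensor to a NumPy array before mapping it onto the flax parameter tree (the real logic lives in `custom_pytorch_jax_converter.py`):

# Minimal sketch (assumption, not the repo's converter): read a PyTorch
# checkpoint on the JAX side and turn its tensors into NumPy arrays.
import torch

def load_pytorch_weights_as_numpy(file_name):
  # map_location='cpu' lets the JAX process read the file without a GPU.
  state_dict = torch.load(file_name, map_location='cpu')
  return {name: tensor.detach().cpu().numpy()
          for name, tensor in state_dict.items()}

# The converter would then overwrite the leaves of the flax params pytree with
# these arrays (transposing where PyTorch and flax use different layouts)
# before jax_utils.replicate is called.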

algoperf/workloads/criteo1tb/criteo1tb_pytorch/workload.py
1 addition, 0 deletions

@@ -88,6 +88,7 @@ def init_model_fn(
         dropout_rate=dropout_rate,
         use_layer_norm=self.use_layer_norm,
         embedding_init_multiplier=self.embedding_init_multiplier)
+    torch.save(model.state_dict(), '/results/pytorch_base_model_criteo1tb_1_july.pth')
     self._param_shapes = param_utils.pytorch_param_shapes(model)
     self._param_types = param_utils.pytorch_param_types(self._param_shapes)
     model.to(DEVICE)
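Because `init_model_fn` runs in every process under the distributed PyTorch setup, each rank would execute this `torch.save` and write the same file. A minimal sketch of a single-writer variant, offered as a suggestion under that assumption and not part of this commit:

# Minimal sketch (not part of this commit): only rank 0 writes the base
# checkpoint, and the other ranks wait until the file is on disk.
import torch
import torch.distributed as dist

def save_base_model(model, path):
  rank = dist.get_rank() if dist.is_initialized() else 0
  if rank == 0:
    torch.save(model.state_dict(), path)
  if dist.is_initialized():
    dist.barrier()  # make sure the file exists before other ranks proceed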

custom_pytorch_jax_converter.py
3 additions, 0 deletions

@@ -26,6 +26,7 @@ def use_pytorch_weights_inplace(jax_params, file_name=None, replicate=False):
   # Load PyTorch state_dict
   state_dict = torch.load(file_name)
   print(state_dict.keys())
+
   # Convert PyTorch tensors to NumPy arrays
   numpy_weights = {k: v.cpu().numpy() for k, v in state_dict.items()}

@@ -59,6 +60,7 @@ def deep_copy_to_cpu(pytree):
   # Load PyTorch state_dict lazily to CPU
   state_dict = torch.load(file_name, map_location='cpu')
   print(state_dict.keys())
+
   # Convert PyTorch tensors to NumPy arrays
   numpy_weights = {k: v.cpu().numpy() for k, v in state_dict.items()}

@@ -128,6 +130,7 @@ def move_to_cpu(tree):
 def are_weights_equal(params1, params2, atol=1e-6, rtol=1e-6):
   """Compares two JAX PyTrees of weights and logs where they differ, safely handling PMAP replication."""
   # Attempt to unreplicate if needed
+
   params1 = maybe_unreplicate(params1)
   params2 = maybe_unreplicate(params2)
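For context, a minimal sketch of the kind of leaf-by-leaf check a helper like `are_weights_equal` performs. This is an assumption based on the docstring above, not the converter's actual implementation:

# Minimal sketch (assumption, not the repo's implementation): compare two
# parameter pytrees leaf by leaf and collect the paths that differ.
import jax
import numpy as np

def compare_pytrees(params1, params2, atol=1e-6, rtol=1e-6):
  leaves1, _ = jax.tree_util.tree_flatten_with_path(params1)
  leaves2, _ = jax.tree_util.tree_flatten_with_path(params2)
  mismatches = []
  for (path, a), (_, b) in zip(leaves1, leaves2):
    if a.shape != b.shape or not np.allclose(a, b, atol=atol, rtol=rtol):
      mismatches.append(jax.tree_util.keystr(path))
  return mismatches  # an empty list means the trees agree within tolerance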

reference_algorithms/schedule_free/jax/submission.py
5 additions, 3 deletions

@@ -171,12 +171,14 @@ def update_params(workload: spec.Workload,
           }, global_step)

   # Log the number of parameters.
-  if global_step % 100 == 0 and workload.metrics_logger is not None:
-    date_ = "2025-06-14"
-    file_name = f"/results/schedule_free_test_pytorch_weights/criteo1tb_{date_}_after_{global_step}_steps.pth"
+  if global_step % 100 == 0:
+    date_ = "2025-07-01"
+    file_name = f"/results/schedule_free_pytorch_weights/criteo1tb_{date_}_after_{global_step}_steps.pth"
     params = use_pytorch_weights_cpu_copy(new_params, file_name=file_name, replicate=True)
     are_weights_equal(new_params, params)
     del params
+
+    breakpoint()

   return (new_optimizer_state, opt_update_fn), new_params, new_model_state
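The added `breakpoint()` pauses the run in the debugger at every 100th step, right after the comparison. A minimal non-interactive alternative, assuming `are_weights_equal` returns a boolean in addition to logging mismatches and that both helpers live in `custom_pytorch_jax_converter.py` (the diff only shows the logging behavior):

# Minimal sketch, under the assumptions stated above.
from custom_pytorch_jax_converter import (are_weights_equal,
                                          use_pytorch_weights_cpu_copy)

def check_against_pytorch_checkpoint(jax_params, file_name):
  # Load the PyTorch checkpoint as a CPU copy shaped like the JAX params.
  reloaded = use_pytorch_weights_cpu_copy(
      jax_params, file_name=file_name, replicate=True)
  if not are_weights_equal(jax_params, reloaded):
    raise RuntimeError(f'JAX params differ from PyTorch checkpoint {file_name}')
  del reloaded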

submission_runner.py
138 additions, 125 deletions

@@ -372,6 +372,19 @@ def train_once(
             rng=update_rng,
             **({'train_state': MappingProxyType(train_state)}
                if needs_train_state else {}))
+        if FLAGS.framework=="pytorch" and global_step % 100 == 0:
+          if global_step > 1000:
+            import torch.distributed as dist
+            import sys
+            dist.destroy_process_group()
+            sys.exit(0)
+          # Save the PyTorch weights to a file every 100 steps.
+          date_ = datetime.date.today().strftime('%Y-%m-%d')
+          file_name = os.path.join(
+              log_dir, f'/results/schedule_free_pytorch_weights/{workload_name}_{date_}_after_{global_step}_steps.pth')
+          logging.info(f'Saving PyTorch weights to {file_name}')
+          torch.save(model_params.module.state_dict(), file_name)
+
     except spec.TrainingCompleteError:
       train_state['training_complete'] = True
     global_step += 1
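One detail about the path built above: because the second argument to `os.path.join` is an absolute path, `log_dir` is discarded and the checkpoint always lands under `/results/...` regardless of the log directory. A two-line illustration of that standard-library behavior (the example paths are hypothetical):

import os

# When a later component is absolute, os.path.join drops everything before it.
print(os.path.join('/experiments/run_1', '/results/foo.pth'))  # -> '/results/foo.pth'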
@@ -383,131 +396,131 @@ def train_once(
(Every line removed below is re-added on the new side verbatim with a leading '#', so the untimed-eval, checkpointing, and logging block stays in the file but is disabled.)

     train_state['accumulated_submission_time'] += (
         train_step_end_time - train_state['last_step_end_time'])

-    # Check if submission is eligible for an untimed eval.
-    if ((train_step_end_time - train_state['last_eval_time']) >=
-        workload.eval_period_time_sec or train_state['training_complete']):
-
-      # Prepare for evaluation (timed).
-      if prepare_for_eval is not None:
-
-        with profiler.profile('Prepare for eval'):
-          del batch
-          prepare_for_eval_start_time = get_time()
-          optimizer_state, model_params, model_state = prepare_for_eval(
-              workload=workload,
-              current_param_container=model_params,
-              current_params_types=workload.model_params_types,
-              model_state=model_state,
-              hyperparameters=hyperparameters,
-              loss_type=workload.loss_type,
-              optimizer_state=optimizer_state,
-              eval_results=eval_results,
-              global_step=global_step,
-              rng=prep_eval_rng)
-          prepare_for_eval_end_time = get_time()
-
-          # Update sumbission time.
-          train_state['accumulated_submission_time'] += (
-              prepare_for_eval_end_time - prepare_for_eval_start_time)
-
-      # Check if time is remaining,
-      # use 1.5x the runtime budget for the self-tuning ruleset.
-      max_allowed_runtime_sec = (
-          workload.max_allowed_runtime_sec if FLAGS.tuning_ruleset == 'external'
-          else 1.5 * workload.max_allowed_runtime_sec)
-      train_state['is_time_remaining'] = (
-          train_state['accumulated_submission_time'] < max_allowed_runtime_sec)
-
-      # Eval if time is remaining (untimed).
-      if train_state['is_time_remaining']:
-
-        with profiler.profile('Evaluation'):
-          _reset_cuda_mem()
-
-          try:
-            eval_start_time = get_time()
-            latest_eval_result = workload.eval_model(global_eval_batch_size,
-                                                     model_params,
-                                                     model_state,
-                                                     eval_rng,
-                                                     data_dir,
-                                                     imagenet_v2_data_dir,
-                                                     global_step)
-            # Check if targets reached.
-            # Note that this is one of the stopping conditions for the length of
-            # a training run. To score the run we only consider the time
-            # to validation target retrospectively.
-            train_state['validation_goal_reached'] = (
-                workload.has_reached_validation_target(latest_eval_result) or
-                train_state['validation_goal_reached'])
-            train_state['test_goal_reached'] = (
-                workload.has_reached_test_target(latest_eval_result) or
-                train_state['test_goal_reached'])
-            goals_reached = (
-                train_state['validation_goal_reached'] and
-                train_state['test_goal_reached'])
-            # Save last eval time.
-            eval_end_time = get_time()
-            train_state['last_eval_time'] = eval_end_time
-
-            # Accumulate eval time.
-            train_state[
-                'accumulated_eval_time'] += eval_end_time - eval_start_time
-
-            # Add times to eval results for logging.
-            latest_eval_result['score'] = (
-                train_state['accumulated_submission_time'])
-            latest_eval_result[
-                'total_duration'] = eval_end_time - global_start_time
-            latest_eval_result['accumulated_submission_time'] = train_state[
-                'accumulated_submission_time']
-            latest_eval_result['accumulated_eval_time'] = train_state[
-                'accumulated_eval_time']
-            latest_eval_result['accumulated_logging_time'] = train_state[
-                'accumulated_logging_time']
-            time_since_start = latest_eval_result['total_duration']
-            logging.info(f'Time since start: {time_since_start:.2f}s, '
-                         f'\tStep: {global_step}, \t{latest_eval_result}')
-            eval_results.append((global_step, latest_eval_result))
-
-            logging_start_time = get_time()
-
-            if log_dir is not None and RANK == 0:
-              metrics_logger.append_scalar_metrics(
-                  latest_eval_result,
-                  global_step=global_step,
-                  preemption_count=preemption_count,
-                  is_eval=True,
-              )
-              if save_checkpoints:
-                checkpoint_utils.save_checkpoint(
-                    framework=FLAGS.framework,
-                    optimizer_state=optimizer_state,
-                    model_params=model_params,
-                    model_state=model_state,
-                    train_state=train_state,
-                    eval_results=eval_results,
-                    global_step=global_step,
-                    preemption_count=preemption_count,
-                    checkpoint_dir=log_dir,
-                    save_intermediate_checkpoints=FLAGS
-                    .save_intermediate_checkpoints)
-
-            logging_end_time = get_time()
-            train_state['accumulated_logging_time'] += (
-                logging_end_time - logging_start_time)
-
-            _reset_cuda_mem()
-
-          except RuntimeError as e:
-            logging.exception(f'Eval step {global_step} error.\n')
-            if 'out of memory' in str(e):
-              logging.warning(
-                  'Error: GPU out of memory during eval during step '
-                  f'{global_step}, error : {str(e)}.')
-              _reset_cuda_mem()
-
-      train_state['last_step_end_time'] = get_time()

   metrics = {'eval_results': eval_results, 'global_step': global_step}
