@@ -123,7 +123,7 @@ def _create_accumulators(self, block, parameters):
         """
         pass

-    def _finish_update(self, block, parameters_and_grads):
+    def _finish_update(self, block):
         """Finish any custom updates needed
            before completing an optimization step

@@ -132,7 +132,7 @@ def _finish_update(self, block, parameters_and_grads):
             parameters: list of parameter variables for the optimizer

         Returns:
-            None
+            list of finish ops or None
         """
         pass

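The new contract lets `_finish_update(block)` hand back whatever ops it appended, so the optimization pass below (which still carries a FIXME about op dependencies) can track them. A minimal, framework-free sketch of that contract, using hypothetical `ToyBlock`/`ToyOptimizer` names that are not part of this codebase:

```python
# Illustrative sketch only: shows the shape of the new _finish_update
# contract (return the ops appended during the finish step, or None).
class ToyOp:
    def __init__(self, op_type):
        self.type = op_type

class ToyBlock:
    def __init__(self):
        self.ops = []

    def append_op(self, op_type):
        op = ToyOp(op_type)
        self.ops.append(op)
        return op

class ToyOptimizer:
    def _finish_update(self, block):
        # Append per-pass bookkeeping ops and return them to the caller.
        scale_op = block.append_op("scale")
        return [scale_op]

block = ToyBlock()
finish_ops = ToyOptimizer()._finish_update(block)
assert [op.type for op in finish_ops] == ["scale"]
```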
@@ -237,7 +237,7 @@ def _create_optimization_pass(self,

         # Get custom finish ops for subclasses
         # FIXME: Need to fix this once we figure out how to handle dependencies
-        self._finish_update(loss.block, parameters_and_grads)
+        self._finish_update(loss.block)

         end = len(global_block.ops)
         return global_block.slice_ops(start, end)
@@ -487,8 +487,6 @@ class AdamOptimizer(Optimizer):
     """
     _moment1_acc_str = "moment1"
     _moment2_acc_str = "moment2"
-    _beta1_pow_acc_str = "beta1_pow_acc"
-    _beta2_pow_acc_str = "beta2_pow_acc"

     def __init__(self,
                  learning_rate=0.001,
@@ -510,22 +508,32 @@ def __init__(self,
     def _create_accumulators(self, block, parameters):
         assert isinstance(block, framework.Block)

+        main_block = block.program.global_block()
+        # Create beta1 and beta2 power tensors
+        beta_shape = [1]
+        self._beta1_pow_acc = self.helper.create_global_variable(
+            name=unique_name.generate('beta1_pow_acc'),
+            dtype='float32' if self._dtype == None else self._dtype,
+            shape=beta_shape,
+            lod_level=0,
+            persistable=True)
+        self.helper.set_variable_initializer(
+            self._beta1_pow_acc, initializer=Constant(self._beta1))
+
+        self._beta2_pow_acc = self.helper.create_global_variable(
+            name=unique_name.generate('beta2_pow_acc'),
+            dtype='float32' if self._dtype == None else self._dtype,
+            shape=beta_shape,
+            lod_level=0,
+            persistable=True)
+
+        self.helper.set_variable_initializer(
+            self._beta2_pow_acc, initializer=Constant(self._beta2))
+
         # Create accumulator tensors for first and second moments
         for p in parameters:
             self._add_accumulator(self._moment1_acc_str, p)
             self._add_accumulator(self._moment2_acc_str, p)
-            self._add_accumulator(
-                name=self._beta1_pow_acc_str,
-                param=p,
-                dtype='float32',
-                fill_value=self._beta1,
-                shape=[1])
-            self._add_accumulator(
-                name=self._beta2_pow_acc_str,
-                param=p,
-                dtype='float32',
-                fill_value=self._beta2,
-                shape=[1])

     def _append_optimize_op(self, block, param_and_grad):
         assert isinstance(block, framework.Block)
@@ -534,11 +542,6 @@ def _append_optimize_op(self, block, param_and_grad):
                                         param_and_grad[0])
         moment2 = self._get_accumulator(self._moment2_acc_str,
                                         param_and_grad[0])
-        beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
-                                              param_and_grad[0])
-        beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str,
-                                              param_and_grad[0])
-
         # create the adam optimize op
         adam_op = block.append_op(
             type=self.type,
@@ -548,8 +551,8 @@ def _append_optimize_op(self, block, param_and_grad):
                 "LearningRate": self._create_param_lr(param_and_grad),
                 "Moment1": moment1,
                 "Moment2": moment2,
-                "Beta1Pow": beta1_pow_acc,
-                "Beta2Pow": beta2_pow_acc
+                "Beta1Pow": self._beta1_pow_acc,
+                "Beta2Pow": self._beta2_pow_acc
             },
             outputs={
                 "ParamOut": param_and_grad[0],
@@ -564,30 +567,24 @@ def _append_optimize_op(self, block, param_and_grad):

         return adam_op

-    def _finish_update(self, block, param_and_grads):
+    def _finish_update(self, block):
         """Update Beta1 and Beta2 Power accumulators
         """
         assert isinstance(block, framework.Block)
         main_block = block.program.global_block()
-        for param, grad in param_and_grads:
-            if grad is None:
-                continue
-            with param.block.program.optimized_guard([param, grad]):
-                beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
-                                                      param)
-                beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str,
-                                                      param)
-                main_block.append_op(
-                    type="scale",
-                    inputs={"X": beta1_pow_acc},
-                    outputs={"Out": beta1_pow_acc},
-                    attrs={"scale": self._beta1})
-
-                main_block.append_op(
-                    type="scale",
-                    inputs={"X": beta2_pow_acc},
-                    outputs={"Out": beta2_pow_acc},
-                    attrs={"scale": self._beta2})
+        scale_beta1 = main_block.append_op(
+            type="scale",
+            inputs={"X": self._beta1_pow_acc},
+            outputs={"Out": self._beta1_pow_acc},
+            attrs={"scale": self._beta1})
+
+        scale_beta2 = main_block.append_op(
+            type="scale",
+            inputs={"X": self._beta2_pow_acc},
+            outputs={"Out": self._beta2_pow_acc},
+            attrs={"scale": self._beta2})
+
+        return [scale_beta1, scale_beta2]


 class AdamaxOptimizer(Optimizer):
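Since beta1^t and beta2^t depend only on how many optimization passes have run, not on any particular parameter, one shared pair of persistable accumulators scaled once per pass replaces the old per-parameter copies. A self-contained sketch of the arithmetic the two appended scale ops produce; the bias-correction factors shown are the standard Adam formula, not code from this file:

```python
# After the finish ops have run t times, the shared accumulators hold
# beta1**t and beta2**t -- the same value for every parameter, which is
# why a single global pair is enough.
beta1, beta2 = 0.9, 0.999
beta1_pow, beta2_pow = beta1, beta2  # initialized to beta1, beta2 above

for step in range(1, 4):
    # standard Adam bias-correction factors consumed at this step
    bias_correction1 = 1.0 - beta1_pow
    bias_correction2 = 1.0 - beta2_pow
    assert abs(beta1_pow - beta1 ** step) < 1e-12
    assert abs(beta2_pow - beta2 ** step) < 1e-12

    # what the two "scale" finish ops do once per optimization pass
    beta1_pow *= beta1
    beta2_pow *= beta2
```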
@@ -630,7 +627,6 @@ class AdamaxOptimizer(Optimizer):
     """
     _moment_acc_str = "moment"
     _inf_norm_acc_str = "inf_norm"
-    _beta1_pow_acc_str = "beta1_pow_acc"

     def __init__(self,
                  learning_rate=0.001,
@@ -650,25 +646,28 @@ def __init__(self,
         self._epsilon = epsilon

     def _create_accumulators(self, block, parameters):
+        # Create beta1 power accumulator tensor
+        beta_shape = [1]
+        self._beta1_pow_acc = self.helper.create_global_variable(
+            name=unique_name.generate('beta1_pow_acc'),
+            dtype='float32' if self._dtype == None else self._dtype,
+            shape=beta_shape,
+            lod_level=0,
+            persistable=True)
+        self.helper.set_variable_initializer(
+            self._beta1_pow_acc, initializer=Constant(self._beta1))
+
         # Create accumulator tensors for first moment and infinity norm
         for p in parameters:
             self._add_accumulator(self._moment_acc_str, p)
             self._add_accumulator(self._inf_norm_acc_str, p)
-            self._add_accumulator(
-                name=self._beta1_pow_acc_str,
-                param=p,
-                dtype='float32',
-                fill_value=self._beta1,
-                shape=[1])

     def _append_optimize_op(self, block, param_and_grad):
         assert isinstance(block, framework.Block)

         moment = self._get_accumulator(self._moment_acc_str, param_and_grad[0])
         inf_norm = self._get_accumulator(self._inf_norm_acc_str,
                                          param_and_grad[0])
-        beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
-                                              param_and_grad[0])
         # create the adamax optimize op
         adamax_op = block.append_op(
             type=self.type,
@@ -678,7 +677,7 @@ def _append_optimize_op(self, block, param_and_grad):
                 "LearningRate": self._create_param_lr(param_and_grad),
                 "Moment": moment,
                 "InfNorm": inf_norm,
-                "Beta1Pow": beta1_pow_acc
+                "Beta1Pow": self._beta1_pow_acc
             },
             outputs={
                 "ParamOut": param_and_grad[0],
@@ -693,22 +692,18 @@ def _append_optimize_op(self, block, param_and_grad):

         return adamax_op

-    def _finish_update(self, block, parameters_and_grads):
+    def _finish_update(self, block):
         """Update Beta1 Power accumulator
         """
         assert isinstance(block, framework.Block)
         main_block = block.program.global_block()
-        for param, grad in parameters_and_grads:
-            if grad is None:
-                continue
-            with param.block.program.optimized_guard([param, grad]):
-                beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
-                                                      param)
-                main_block.append_op(
-                    type="scale",
-                    inputs={"X": beta1_pow_acc},
-                    outputs={"Out": beta1_pow_acc},
-                    attrs={"scale": self._beta1})
+        scale_beta1 = main_block.append_op(
+            type="scale",
+            inputs={"X": self._beta1_pow_acc},
+            outputs={"Out": self._beta1_pow_acc},
+            attrs={"scale": self._beta1})
+
+        return [scale_beta1]


 class DecayedAdagradOptimizer(Optimizer):
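Adamax keeps only a beta1 power accumulator because, in the Adamax formulation of Kingma & Ba, the infinity-norm term needs no bias correction; the accumulator only rescales the step size. A small sketch of that relationship (the `lr_t` formula is from the published algorithm, not from this file):

```python
# Sketch of where the shared Beta1Pow enters the Adamax update: it only
# adjusts the effective learning rate, so no beta2 power is needed.
beta1 = 0.9
learning_rate = 0.001
beta1_pow = beta1  # persistable accumulator, starts at beta1

for step in range(1, 4):
    # effective step size used at this pass (Kingma & Ba, Algorithm 2)
    lr_t = learning_rate / (1.0 - beta1_pow)
    assert abs(beta1_pow - beta1 ** step) < 1e-12

    # the single "scale" finish op appended by _finish_update
    beta1_pow *= beta1
```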
@@ -1162,10 +1157,7 @@ def __init__(self,
             self.params_grads.append((param, grad))

         for param, grad in self.params_grads:
-            if grad is None:
-                continue
-            with param.block.program.optimized_guard([param, grad]):
-                self._append_average_accumulate_op(param)
+            self._append_average_accumulate_op(param)

         self.apply_program = Program()
         block = self.apply_program.global_block()