Commit abbeb1e
Only allow schema evolution for the case where a new field is the target of an assignment whose value is the same-named field in the source
1 parent 24b1a51 commit abbeb1e

File tree: 7 files changed, +132 -149 lines changed
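To illustrate the rule this commit enforces, here is a hedged sketch (the tables `target` and `source`, the column `extra_col`, and the surrounding SparkSession are hypothetical, not part of this commit). With MERGE ... WITH SCHEMA EVOLUTION, a new source column is now added to the target schema only when it is the assignment target and its value is the same-named field from the source:

  // Assumes a SparkSession `spark`, v2 tables `target` and `source`, and a
  // column `extra_col` that exists only in `source` (all names hypothetical).

  // Qualifies for schema evolution after this commit: the new field is assigned
  // directly from the same-named source column (col = s.col).
  spark.sql("""
    MERGE WITH SCHEMA EVOLUTION INTO target t
    USING source s
    ON t.id = s.id
    WHEN MATCHED THEN UPDATE SET t.extra_col = s.extra_col
  """)

  // No longer triggers schema evolution: the value is not the same-named source
  // column, so `extra_col` is not added to the target, and the assignment to
  // t.extra_col instead fails to resolve.
  spark.sql("""
    MERGE WITH SCHEMA EVOLUTION INTO target t
    USING source s
    ON t.id = s.id
    WHEN MATCHED THEN UPDATE SET t.extra_col = s.some_other_col
  """)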

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala

Lines changed: 2 additions & 3 deletions
@@ -1669,7 +1669,7 @@ class Analyzer(override val catalogManager: CatalogManager) extends RuleExecutor

       case u: UpdateTable => resolveReferencesInUpdate(u)

-      case m @ MergeIntoTable(targetTable, sourceTable, _, _, _, _, _, _)
+      case m @ MergeIntoTable(targetTable, sourceTable, _, _, _, _, _)
           if !m.resolved && targetTable.resolved && sourceTable.resolved && !m.needSchemaEvolution =>

         EliminateSubqueryAliases(targetTable) match {
@@ -1749,8 +1749,7 @@ class Analyzer(override val catalogManager: CatalogManager) extends RuleExecutor
             m.copy(mergeCondition = resolvedMergeCondition,
               matchedActions = newMatchedActions,
               notMatchedActions = newNotMatchedActions,
-              notMatchedBySourceActions = newNotMatchedBySourceActions,
-              originalSourceActions = newMatchedActions ++ newNotMatchedActions)
+              notMatchedBySourceActions = newNotMatchedBySourceActions)
         }

       // UnresolvedHaving can host grouping expressions and aggregate functions. We should resolve

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveMergeIntoSchemaEvolution.scala

Lines changed: 2 additions & 2 deletions
@@ -34,7 +34,7 @@ import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation
 object ResolveMergeIntoSchemaEvolution extends Rule[LogicalPlan] {

   override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators {
-    case m @ MergeIntoTable(_, _, _, _, _, _, _, _)
+    case m @ MergeIntoTable(_, _, _, _, _, _, _)
         if m.needSchemaEvolution =>
       val newTarget = m.targetTable.transform {
         case r : DataSourceV2Relation => performSchemaEvolution(r, m)
@@ -46,7 +46,7 @@ object ResolveMergeIntoSchemaEvolution extends Rule[LogicalPlan] {
       : DataSourceV2Relation = {
     (relation.catalog, relation.identifier) match {
       case (Some(c: TableCatalog), Some(i)) =>
-        val referencedSourceSchema = MergeIntoTable.referencedSourceSchema(m)
+        val referencedSourceSchema = MergeIntoTable.sourceSchemaForSchemaEvolution(m)

         val changes = MergeIntoTable.schemaChanges(relation.schema, referencedSourceSchema)
         c.alterTable(i, changes: _*)
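For context, the evolution step applies the computed schema changes through the catalog API. A minimal sketch of what a single added column amounts to at that level (the method and column name are illustrative, not from this commit; the real changes come from MergeIntoTable.schemaChanges):

  import org.apache.spark.sql.connector.catalog.{Identifier, TableCatalog, TableChange}
  import org.apache.spark.sql.types.StringType

  // Illustrative only: one AddColumn change of the kind schemaChanges produces
  // for a pruned source field that is missing from the target schema.
  def evolveWithOneColumn(catalog: TableCatalog, ident: Identifier): Unit = {
    val change = TableChange.addColumn(Array("extra_col"), StringType) // hypothetical field
    catalog.alterTable(ident, change)
  }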

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/RewriteMergeIntoTable.scala

Lines changed: 3 additions & 3 deletions
@@ -45,7 +45,7 @@ object RewriteMergeIntoTable extends RewriteRowLevelCommand with PredicateHelper

   override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators {
     case m @ MergeIntoTable(aliasedTable, source, cond, matchedActions, notMatchedActions,
-        notMatchedBySourceActions, _, _) if m.resolved && m.rewritable && m.aligned &&
+        notMatchedBySourceActions, _) if m.resolved && m.rewritable && m.aligned &&
         !m.needSchemaEvolution && matchedActions.isEmpty && notMatchedActions.size == 1 &&
         notMatchedBySourceActions.isEmpty =>

@@ -79,7 +79,7 @@ object RewriteMergeIntoTable extends RewriteRowLevelCommand with PredicateHelper
     }

     case m @ MergeIntoTable(aliasedTable, source, cond, matchedActions, notMatchedActions,
-        notMatchedBySourceActions, _, _)
+        notMatchedBySourceActions, _)
         if m.resolved && m.rewritable && m.aligned && !m.needSchemaEvolution &&
           matchedActions.isEmpty && notMatchedBySourceActions.isEmpty =>

@@ -121,7 +121,7 @@ object RewriteMergeIntoTable extends RewriteRowLevelCommand with PredicateHelper
     }

     case m @ MergeIntoTable(aliasedTable, source, cond, matchedActions, notMatchedActions,
-        notMatchedBySourceActions, _, _)
+        notMatchedBySourceActions, _)
         if m.resolved && m.rewritable && m.aligned && !m.needSchemaEvolution =>

       EliminateSubqueryAliases(aliasedTable) match {

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala

Lines changed: 31 additions & 43 deletions
@@ -860,9 +860,7 @@ case class MergeIntoTable(
     matchedActions: Seq[MergeAction],
     notMatchedActions: Seq[MergeAction],
     notMatchedBySourceActions: Seq[MergeAction],
-    withSchemaEvolution: Boolean,
-    // Preserves original pre-aligned actions for source matches
-    originalSourceActions: Seq[MergeAction])
+    withSchemaEvolution: Boolean)
   extends BinaryCommand with SupportsSubquery {

   lazy val aligned: Boolean = {
@@ -895,14 +893,12 @@ case class MergeIntoTable(
     case _ => false
   }

-  // a pruned version of source schema that only contains columns/nested fields
-  // explicitly assigned by MERGE INTO actions
-  private lazy val referencedSourceSchema: StructType =
-    MergeIntoTable.referencedSourceSchema(this)
+  private lazy val sourceSchemaForEvolution: StructType =
+    MergeIntoTable.sourceSchemaForSchemaEvolution(this)

   lazy val needSchemaEvolution: Boolean = {
     schemaEvolutionEnabled &&
-      MergeIntoTable.schemaChanges(targetTable.schema, referencedSourceSchema).nonEmpty
+      MergeIntoTable.schemaChanges(targetTable.schema, sourceSchemaForEvolution).nonEmpty
   }

   private def schemaEvolutionEnabled: Boolean = withSchemaEvolution && {
@@ -921,25 +917,6 @@ case class MergeIntoTable(

 object MergeIntoTable {

-  def apply(
-      targetTable: LogicalPlan,
-      sourceTable: LogicalPlan,
-      mergeCondition: Expression,
-      matchedActions: Seq[MergeAction],
-      notMatchedActions: Seq[MergeAction],
-      notMatchedBySourceActions: Seq[MergeAction],
-      withSchemaEvolution: Boolean): MergeIntoTable = {
-    MergeIntoTable(
-      targetTable,
-      sourceTable,
-      mergeCondition,
-      matchedActions,
-      notMatchedActions,
-      notMatchedBySourceActions,
-      withSchemaEvolution,
-      matchedActions ++ notMatchedActions)
-  }
-
   def getWritePrivileges(
       matchedActions: Iterable[MergeAction],
       notMatchedActions: Iterable[MergeAction],
@@ -1020,16 +997,18 @@ object MergeIntoTable {
     }
   }

-  // Filter the source schema to retain only fields that are referenced
-  // by at least one merge action
-  def referencedSourceSchema(merge: MergeIntoTable): StructType = {
+  // A pruned version of source schema that only contains columns/nested fields
+  // explicitly and directly assigned to a target counterpart in MERGE INTO actions.
+  // New columns/nested fields not existing in target will be added for schema evolution.
+  def sourceSchemaForSchemaEvolution(merge: MergeIntoTable): StructType = {

-    val assignments = merge.originalSourceActions.collect {
-      case a: UpdateAction => a.assignments.map(_.key)
-      case a: InsertAction => a.assignments.map(_.key)
+    val actions = merge.matchedActions ++ merge.notMatchedActions
+    val assignments = actions.collect {
+      case a: UpdateAction => a.assignments
+      case a: InsertAction => a.assignments
     }.flatten

-    val containsStarAction = merge.originalSourceActions.exists {
+    val containsStarAction = actions.exists {
       case _: UpdateStarAction => true
       case _: InsertStarAction => true
       case _ => false
@@ -1046,7 +1025,7 @@ object MergeIntoTable {
       // If this is a struct and one of the children is being assigned to in a merge clause,
       // keep it and continue filtering children.
       case struct: StructType if assignments.exists(assign =>
-          isPrefix(fieldPath, extractFieldPath(assign))) =>
+          isPrefix(fieldPath, extractFieldPath(assign.key))) =>
         Some(field.copy(dataType = filterSchema(struct, fieldPath)))
       // The field isn't assigned to directly or indirectly (i.e. its children) in any non-*
       // clause. Check if it should be kept with any * action.
@@ -1058,8 +1037,7 @@ object MergeIntoTable {
       }
     })

-    val res = filterSchema(merge.sourceTable.schema, Seq.empty)
-    res
+    filterSchema(merge.sourceTable.schema, Seq.empty)
   }

   // Helper method to extract field path from an Expression.
@@ -1071,18 +1049,28 @@ object MergeIntoTable {
     case _ => Seq.empty
   }

-  // Helper method to check if a given field path is a prefix of another path. Delegates
-  // equality to conf.resolver to correctly handle case sensitivity.
+  // Helper method to check if a given field path is a prefix of another path.
   private def isPrefix(prefix: Seq[String], path: Seq[String]): Boolean =
     prefix.length <= path.length && prefix.zip(path).forall {
       case (prefixNamePart, pathNamePart) =>
         SQLConf.get.resolver(prefixNamePart, pathNamePart)
     }

-  // Helper method to check if an assignment Expression's field path is equal to a path.
-  def isEqual(assignmentExpr: Expression, path: Seq[String]): Boolean = {
-    val exprPath = extractFieldPath(assignmentExpr)
-    exprPath.length == path.length && isPrefix(exprPath, path)
+  // Helper method to check if a given field path is a suffix of another path.
+  private def isSuffix(prefix: Seq[String], path: Seq[String]): Boolean =
+    prefix.length <= path.length && prefix.reverse.zip(path.reverse).forall {
+      case (prefixNamePart, pathNamePart) =>
+        SQLConf.get.resolver(prefixNamePart, pathNamePart)
+    }
+
+  // Helper method to check if an assignment key is equal to a source column
+  // and if the assignment value is the corresponding source column directly
+  private def isEqual(assignment: Assignment, path: Seq[String]): Boolean = {
+    val assignmenKeyExpr = extractFieldPath(assignment.key)
+    val assignmentValueExpr = extractFieldPath(assignment.value)
+    // Valid assignments are: col = s.col or col.nestedField = s.col.nestedField
+    assignmenKeyExpr.length == path.length && isPrefix(assignmenKeyExpr, path) &&
+      isSuffix(path, assignmentValueExpr)
   }
 }
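To make the new matching rule concrete, below is a self-contained sketch of the prefix/suffix check that the rewritten isEqual helper encodes. A case-insensitive comparison stands in for SQLConf.get.resolver, and paths are plain Seq[String] rather than Catalyst expressions; the object and method names are illustrative, not part of this commit:

  object PathMatchSketch {
    // Stand-in for SQLConf.get.resolver: case-insensitive name comparison.
    private val resolver: (String, String) => Boolean = _.equalsIgnoreCase(_)

    private def isPrefix(prefix: Seq[String], path: Seq[String]): Boolean =
      prefix.length <= path.length && prefix.zip(path).forall(resolver.tupled)

    private def isSuffix(suffix: Seq[String], path: Seq[String]): Boolean =
      suffix.length <= path.length &&
        suffix.reverse.zip(path.reverse).forall(resolver.tupled)

    // An assignment qualifies a field path for schema evolution only when the
    // assignment key equals the path and the assignment value ends with that
    // same path, i.e. col = s.col or col.nested = s.col.nested.
    def isEqual(key: Seq[String], value: Seq[String], path: Seq[String]): Boolean =
      key.length == path.length && isPrefix(key, path) && isSuffix(path, value)

    def main(args: Array[String]): Unit = {
      assert(isEqual(Seq("col"), Seq("s", "col"), Seq("col")))                // col = s.col
      assert(isEqual(Seq("col", "n"), Seq("s", "col", "n"), Seq("col", "n"))) // col.n = s.col.n
      assert(!isEqual(Seq("col"), Seq("s", "other"), Seq("col")))             // col = s.other
    }
  }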

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/PullupCorrelatedPredicatesSuite.scala

Lines changed: 1 addition & 1 deletion
@@ -167,7 +167,7 @@ class PullupCorrelatedPredicatesSuite extends PlanTest
     assert(optimized.resolved)

     optimized match {
-      case MergeIntoTable(_, _, s: InSubquery, _, _, _, _, _) =>
+      case MergeIntoTable(_, _, s: InSubquery, _, _, _, _) =>
         val outerRefs = SubExprUtils.getOuterReferences(s.query.plan)
         assert(outerRefs.isEmpty, "should be no outer refs")
       case other =>
