Avoid duplicates with > 1000 changelog entries.

toddbc · toddbc · commit a1f14a1209f5 · 2017-10-02T15:31:44.000-07:00
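If an import modifies 2000 products and then changes their category
allocation, etc., the changelog may contain many duplicate ids, but the
duplicates are more than 1000 changelog entries apart, so they land in
separate 1000-entry batches and are passed to the indexer again.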

This loads all the ids in one batch (which should be relatively cheap
memory- and time-wise), and then runs over them with the indexer in
smaller chunks.  This way indexers continue to see the small chunk
sizes, in case they would fail with too many ids.
diff --git a/lib/internal/Magento/Framework/Mview/View.php b/lib/internal/Magento/Framework/Mview/View.php
@@ -21,6 +21,11 @@ class View extends \Magento\Framework\DataObject implements ViewInterface
      */
     const DEFAULT_BATCH_SIZE = 1000;
 
+    /**
+     * Max versions to load from database at a time
+     */
+    const MAX_VERSION_QUERY_BATCH = 100000;
+
     /**
      * @var string
      */
@@ -272,14 +277,19 @@ public function update()
             try {
                 $this->getState()->setStatus(View\StateInterface::STATUS_WORKING)->save();
 
+                $versionBatchSize = self::MAX_VERSION_QUERY_BATCH;
                 $batchSize = isset($this->changelogBatchSize[$this->getChangelog()->getViewId()])
                     ? $this->changelogBatchSize[$this->getChangelog()->getViewId()]
                     : self::DEFAULT_BATCH_SIZE;
 
-                for ($versionFrom = $lastVersionId; $versionFrom < $currentVersionId; $versionFrom += $batchSize) {
-                    $ids = $this->getChangelog()->getList($versionFrom, $versionFrom + $batchSize);
+                for ($versionFrom = $lastVersionId; $versionFrom < $currentVersionId; $versionFrom += $versionBatchSize) {
+                    // Don't go past the current version for atomicity.
+                    $versionTo = min($currentVersionId, $versionFrom + $versionBatchSize);
+                    $ids = $this->getChangelog()->getList($versionFrom, $versionTo);
 
-                    if (!empty($ids)) {
+                    // We run the actual indexer in batches.  Chunked AFTER loading to avoid duplicates in separate chunks.
+                    $chunks = array_chunk($ids, $batchSize);
+                    foreach ($chunks as $ids) {
                         $action->execute($ids);
                     }
                 }