From 6a0cb9818f4ef46539127b816d0badf13ca3dd51 Mon Sep 17 00:00:00 2001
From: Tim Steiner <tsteiner2@unl.edu>
Date: Fri, 29 Oct 2010 21:28:05 +0000
Subject: [PATCH] The unl migration tool will now ignore pages/files if they
 are duplicates of pages/files it has already seen.

git-svn-id: file:///tmp/wdn_thm_drupal/trunk@279 20a16fea-79d4-4915-8869-1ea9d5ebf173
---
 sites/all/modules/unl/unl_migration.php | 33 ++++++++++++++++---------
 1 file changed, 21 insertions(+), 12 deletions(-)

diff --git a/sites/all/modules/unl/unl_migration.php b/sites/all/modules/unl/unl_migration.php
index aaf65715..abb76427 100644
--- a/sites/all/modules/unl/unl_migration.php
+++ b/sites/all/modules/unl/unl_migration.php
@@ -72,18 +72,19 @@ class Unl_Migration_Tool
 
     private $_curl;
 
-    private $_siteMap            = array();
-    private $_processedPages     = array();
-    private $_content            = array();
-    private $_createdContent     = array();
-    private $_lastModifications  = array();
-    private $_hrefTransform      = array();
-    private $_hrefTransformFiles = array();
-    private $_menu               = array();
-    private $_nodeMap            = array();
-    private $_pageTitles         = array();
-    private $_log                = array();
-    private $_blocks             = array();
+    private $_siteMap             = array();
+    private $_processedPages      = array();
+    private $_processedPageHashes = array();
+    private $_content             = array();
+    private $_createdContent      = array();
+    private $_lastModifications   = array();
+    private $_hrefTransform       = array();
+    private $_hrefTransformFiles  = array();
+    private $_menu                = array();
+    private $_nodeMap             = array();
+    private $_pageTitles          = array();
+    private $_log                 = array();
+    private $_blocks              = array();
     
     /**
      * Keep track of the state of the migration progress so that we can resume later
@@ -397,6 +398,14 @@ class Unl_Migration_Tool
             $this->_log('The file at ' . $fullPath . ' was empty! Ignoring.');
             return;
         }
+        
+        $pageHash = hash('md5', $data['content']);
+        if (($matchingPath = array_search($pageHash, $this->_processedPageHashes)) !== FALSE) {
+            $this->_log("The file found at $fullPath was a duplicate of the file at {$this->_baseUrl}$matchingPath ! Ignoring.");
+            return;
+        }
+        $this->_processedPageHashes[$path] = $pageHash; 
+        
         if ($data['lastModified']) {
             $this->_lastModifications[$path] = $data['lastModified'];
         }
-- 
GitLab