From 16feffb840076ed568489f57872a4241b98ac61d Mon Sep 17 00:00:00 2001
From: Tim Steiner <tsteiner2@unl.edu>
Date: Fri, 16 Jul 2010 22:32:00 +0000
Subject: [PATCH] If available, use ftp to grab everything from a site when
 migrating it instead of just crawling. Also clean up all of the debugging
 output so it's a bit easier to turn into a log.

git-svn-id: file:///tmp/wdn_thm_drupal/branches/drupal-7.x@142 20a16fea-79d4-4915-8869-1ea9d5ebf173
---
 sites/all/modules/unl/unl_migration.php | 132 ++++++++++++++++++------
 1 file changed, 98 insertions(+), 34 deletions(-)

diff --git a/sites/all/modules/unl/unl_migration.php b/sites/all/modules/unl/unl_migration.php
index 36e7621b..aa1127ea 100644
--- a/sites/all/modules/unl/unl_migration.php
+++ b/sites/all/modules/unl/unl_migration.php
@@ -77,6 +77,7 @@ class Unl_Migration_Tool
     private $_menu;
     private $_nodeMap;
     private $_pageTitles;
+    private $_log;
     
     private function __construct($baseUrl, $frontierPath, $frontierUser, $frontierPass)
     {
@@ -97,10 +98,12 @@ class Unl_Migration_Tool
         $this->_menu = array();
         $this->_nodeMap = array();
         $this->_pageTitles = array();
+        $this->_log = array();
         
         $this->_baseUrl = $baseUrl;
         $this->_addSitePath('');
         $this->_curl = curl_init();
+        $this->_frontierScan();
     }
     
     private function _migrate()
@@ -112,12 +115,13 @@ class Unl_Migration_Tool
     	
     	// Process all of the pages on the site
         do {
+            set_time_limit(30);
+            
             $pagesToProcess = $this->_getPagesToProcess();
             foreach ($pagesToProcess as $pageToProcess) {
                 $this->_processPage($pageToProcess);
             }
             //if ($i++ == 2) break;
-            echo PHP_EOL . 'I = ' . $i++ . PHP_EOL;
         } while (count($pagesToProcess) > 0);
         
         // Fix any links to files that got moved to sites/<site>/files
@@ -134,7 +138,8 @@ class Unl_Migration_Tool
         
         // Update links and then create new page nodes.
         foreach ($this->_content as $path => $content) {
-            echo 'PATH: ' . $path . PHP_EOL;
+            set_time_limit(30);
+            
         	$hrefTransform = $this->_hrefTransform[$path];
         	
         	if (is_array($hrefTransform)) {
@@ -144,19 +149,16 @@ class Unl_Migration_Tool
             $this->_createPage($pageTitle, $content, $path, '' == $path);
         }
         
-        var_dump($this->_nodeMap);
-        
         $this->_createMenu();
         
+        print_r($this->_log);
         exit;
     }
     
     private function _addSitePath($path)
     {
     	if (($fragmentStart = strrpos($path, '#')) !== FALSE) {
-    		echo 'Changing ' . $path;
             $path = substr($path, 0, $fragmentStart);
-            echo ' to ' . $path . PHP_EOL;
     	}
         $this->_siteMap[hash('SHA256', $path)] = $path;
     }
@@ -198,7 +200,7 @@ class Unl_Migration_Tool
         		continue;
         	}
         	$primaryLinkNode = $primaryLinkLiNode->getElementsByTagName('a')->item(0);
-        	$menuItem = array('text' => $primaryLinkNode->textContent,
+        	$menuItem = array('text' => trim($primaryLinkNode->textContent),
         	                  'href' => $this->_makeLinkAbsolute($primaryLinkNode->getAttribute('href'), ''));
             
         	$childLinksUlNode = $primaryLinkLiNode->getElementsByTagName('ul')->item(0);
@@ -212,12 +214,16 @@ class Unl_Migration_Tool
         			continue;
         		}
         		$childLinkNode = $childLinkLiNode->getElementsByTagName('a')->item(0);
-	            $childMenu[] = array('text' => $childLinkNode->textContent,
+	            $childMenu[] = array('text' => trim($childLinkNode->textContent),
 	                                 'href' => $this->_makeLinkAbsolute($childLinkNode->getAttribute('href'), ''));
         	}
         	$menuItem['children'] = $childMenu;
             $this->_menu[] = $menuItem;
         }
+        
+        if (count($this->_menu) == 0) {
+            $this->_log('Could not find the navigation menu for your site!');
+        }
     }
 
     private function _createMenu()
@@ -228,6 +234,7 @@ class Unl_Migration_Tool
                 'expanded' => TRUE,
                 'menu_name' => 'main-menu',
                 'link_title' => $primaryMenu['text'],
+                'link_path' => '',
                 'weight' => $primaryWeights++
             );
             $href = $primaryMenu['href'];
@@ -244,24 +251,33 @@ class Unl_Migration_Tool
                 	$path = substr($path, 0, -1);
                 }
         		$nodeId = array_search($path, $this->_nodeMap, TRUE);
-        		$item['link_path'] = 'node/' . $nodeId;
-        		echo '[' . $nodeId . '] => ' . $path . PHP_EOL;  
+        		if ($nodeId) {
+        		    $item['link_path'] = 'node/' . $nodeId;
+        		}  
         	} else {
                 $item['link_path'] = $href;
         	}
-            menu_link_save($item);
-            print_r($item);
+        	
+        	if ($item['link_path']) {
+                menu_link_save($item);
+                $this->_log('Created menu item "' . $item['link_title'] . '" linked to ' . $item['link_path'] . '.');
+        	} else {
+        	    $this->_log('Error: could not find a node to link to the ' . $item['link_title'] . ' menu item.');
+        	    continue;
+        	}
             
             if (!array_key_exists('children', $primaryMenu)) {
             	continue;
             }
             
             $plid = $item['mlid'];
+            $parentTitle = $item['link_title'];
             $childWeights = 1;
             foreach ($primaryMenu['children'] as $childMenu) {
 	            $item = array(
 	                'menu_name' => 'main-menu',
 	                'link_title' => $childMenu['text'],
+	                'link_path' => '',
 	                'plid' => $plid,
                     'weight' => $childWeights++
 	            );
@@ -279,13 +295,19 @@ class Unl_Migration_Tool
 	                    $path = substr($path, 0, -1);
 	                }
                     $nodeId = array_search($path, $this->_nodeMap, TRUE);
-                    $item['link_path'] = 'node/' . $nodeId;
-                    echo '[' . $nodeId . '] => ' . $path . PHP_EOL;
+                    if ($nodeId) {
+                        $item['link_path'] = 'node/' . $nodeId;
+                    }
 	            } else {
 	                $item['link_path'] = $href;
 	            }
-	            menu_link_save($item);
-                print_r($item);
+	            
+	            if ($item['link_path']) {
+	                menu_link_save($item);
+                    $this->_log('Created menu item "' . $parentTitle . ' / ' . $item['link_title'] . '" linked to ' . $item['link_path'] . '.');
+	            } else {
+            	    $this->_log('Error: could not find a node to link to the "' . $parentTitle . ' / ' . $item['link_title'] . '" menu.');
+            	}
             }
         }
     }
@@ -293,6 +315,7 @@ class Unl_Migration_Tool
     private function _processPage($path)
     {
     	$this->_addProcessedPage($path);
+    	$fullPath = $this->_baseUrl . $path;
     	
         $url = $this->_baseUrl . $path;
         $startToken = '<!-- InstanceBeginEditable name="maincontentarea" -->';
@@ -301,6 +324,7 @@ class Unl_Migration_Tool
     
         $data = $this->_getUrl($url);
         if (!$data['content']) {
+            $this->_log('The file at ' . $fullPath . ' was empty! Ignoring.');
         	return;
         }
         if ($data['lastModified']) {
@@ -308,11 +332,11 @@ class Unl_Migration_Tool
         }
         if (strpos($data['contentType'], 'html') === FALSE) {
         	if (!$data['contentType']) {
+        	    $this->_log('The file type at ' . $fullPath . ' was not specified. Ignoring.');
         		return;
         	}
         	drupal_mkdir('public://' . dirname($path), NULL, TRUE);
         	$file = file_save_data($data['content'], 'public://' . $path, FILE_EXISTS_REPLACE);
-        	echo 'Uploaded file: ' . $path. PHP_EOL;
         	$this->_hrefTransformFiles[$path] = file_directory_path() . '/' . $path;
         	return;
         }
@@ -328,7 +352,8 @@ class Unl_Migration_Tool
         $maincontentarea = substr($html,
                                   $contentStart + strlen($startToken),
                                   $contentEnd - $contentStart - strlen($startToken));
-        if (!$maincontentarea || $contentStart === FALSE) {
+        if (!$maincontentarea || $contentStart === FALSE || $contentEnd === FALSE) {
+            $this->_log('The file at ' . $fullPath . ' has no valid maincontentarea. Ignoring.');
             return;
         }
         $maincontentarea = trim($maincontentarea);
@@ -358,13 +383,13 @@ class Unl_Migration_Tool
         }
         
         if (!$pageTitle) {
+            $this->_log('No page title was found at ' . $fullPath . '.');
             $pageTitle = 'Untitled';
         }
         
-        echo 'Page Title: ' . $pageTitle . PHP_EOL;
-        
         $maincontentNode = $dom->getElementById('maincontent');
         if (!$maincontentNode) {
+            $this->_log('The file at ' . $fullPath . ' has no valid maincontentarea. Ignoring.');
         	return;
         }
         
@@ -400,7 +425,6 @@ class Unl_Migration_Tool
     
     private function _makeLinkAbsolute($href, $path)
     {
-    	echo $href . ' => ';
         if (substr($path, -1) == '/') {
             $intermediatePath = $path;
         } else {
@@ -414,6 +438,9 @@ class Unl_Migration_Tool
         }
         
         $parts = parse_url($href);
+        if ($parts['scheme'] == 'mailto') {
+            return $href;
+        }
         if ($parts['scheme']) {
             $absoluteUrl = $href;
         } else if (substr($parts['path'], 0, 1) == '/') {
@@ -438,17 +465,14 @@ class Unl_Migration_Tool
         $absoluteUrl = $parts['scheme'] . '://' . $parts['host'] . $parts['path'];
         if ($parts['fragment']) {
             $absoluteUrl .= '#' . $parts['fragment'];
-        }            
-        echo $absoluteUrl . PHP_EOL;
+        }
         return $absoluteUrl;
     }
     
     private function _createPage($title, $content, $alias = '', $lastModified = NULL, $makeFrontPage = FALSE)
     {
-    	echo 'Alias: ' . PHP_EOL;
-        var_dump($alias);
         
-    	if (substr($alias, -1) == '/') {
+        if (substr($alias, -1) == '/') {
     		$alias = substr($alias, 0, -1);
     	}
     	
@@ -468,7 +492,12 @@ class Unl_Migration_Tool
         );
         
         node_submit($node);
-        node_save($node);
+        try {
+            node_save($node);
+        } catch (Exception $e) {
+            $this->_log('Error saving page at ' . $alias . '. This is probably a case sensitivity conflict.');
+            return;
+        }
         
         if ($this->_lastModifications[$alias]) {
             $mtime = $this->_lastModifications[$alias];
@@ -482,20 +511,22 @@ class Unl_Migration_Tool
 	            ->execute();
         }
         
-        var_dump($alias);
         $this->_nodeMap[$node->nid] = $alias;
         
         if ($makeFrontPage) {
         	variable_set('site_frontpage', 'node/' . $node->nid);
         }
+        
+        $this->_log('Created page "' . $title . '" with node id ' . $node->nid . ' at ' . $alias . '.');
     }
     
     private function _getUrl($url)
     {
+        $url = strtr($url, array(' ' => '%20'));
     	curl_setopt($this->_curl, CURLOPT_URL, $url);
     	curl_setopt($this->_curl, CURLOPT_RETURNTRANSFER, TRUE);
     	curl_setopt($this->_curl, CURLOPT_HEADER, TRUE);
-    	echo 'Retreiving ' . $url . PHP_EOL;
+    	
     	$data = curl_exec($this->_curl);
     	$meta = curl_getinfo($this->_curl);
     	
@@ -508,18 +539,19 @@ class Unl_Migration_Tool
         	$splitPos = strpos($rawHeader, ':');
         	$headerKey = substr($rawHeader, 0, $splitPos);
         	$headerValue = substr($rawHeader, $splitPos+1);
-        	$headers[$headerKey] = $headerValue;
+        	$headers[$headerKey] = trim($headerValue);
         }
     	
         $content = substr($data, $meta['header_size']);
         
     	if ($meta['http_code'] == 301) {
-    		preg_match('/Location: (.*)/', $content, $matches);
-    		$location = $matches[1];
+    		$location = $headers['Location'];
     		$path = substr($location, strlen($this->_baseUrl));
-    		$this->_addSitePath($path); 
+    		$this->_addSitePath($path);
+    		$this->_log('Found a redirect from ' . $url . ' to ' . $location . '. Some links may need to be updated.');
             return FALSE;
     	} else if ($meta['http_code'] != 200) {
+    	    $this->_log('Error: HTTP ' . $meta['http_code'] . ' while fetching ' . $url . '. Possible dead link.');
     		return FALSE;
     	}
     	
@@ -550,7 +582,7 @@ class Unl_Migration_Tool
     	if (substr($ftpPath, -1) == '/') {
     		$ftpPath .= 'index.shtml';
     	}
-    	echo $ftpPath . PHP_EOL;
+    	
         $files = ftp_rawlist($this->_frontier, $ftpPath);
         $mtime = substr($files[0], 43, 12);
         $mtime = strtotime($mtime);
@@ -569,9 +601,70 @@ class Unl_Migration_Tool
 	        $login = ftp_login($this->_frontier, $this->_frontierUser, $this->_frontierPass);
 	        if (!$login) {
 	        	$this->_frontier = NULL;
+	        	$this->_log('Error: could not connect to frontier with user ' . $this->_frontierUser . '.');
 	        }
     	}
     	return $this->_frontier;
     }
+    
+    /**
+     * Recursively walks the frontier FTP tree and registers every file found
+     * as a site path to migrate.
+     *
+     * Does nothing when no frontier connection can be established.
+     *
+     * @param string $path Directory to scan, relative to the frontier path;
+     *                     empty or ending in '/'. Defaults to '' so the
+     *                     constructor can call this without arguments.
+     */
+    private function _frontierScan($path = '')
+    {
+        if (!$this->_frontierConnect()) {
+            return;
+        }
+        
+        $ftpPath = $this->_frontierPath . $path;
+        $rawFileList = ftp_rawlist($this->_frontier, $ftpPath);
+        $fileList = ftp_nlist($this->_frontier, $ftpPath);
+        if (!is_array($rawFileList) || !is_array($fileList)) {
+            // Both functions return FALSE when the listing fails.
+            $this->_log('Could not list ' . $ftpPath . ' on frontier. Skipping.');
+            return;
+        }
+        
+        // NOTE(review): this assumes ftp_rawlist() and ftp_nlist() return their
+        // entries in the same order, and that ftp_nlist() prefixes each name with
+        // $ftpPath. Confirm against the frontier server.
+        foreach ($rawFileList as $index => $rawListing) {
+            if (!isset($fileList[$index])) {
+                continue;
+            }
+            $file = substr($fileList[$index], strlen($ftpPath));
+            if (!is_string($file) || $file === '' || $file === '.' || $file === '..') {
+                // Never recurse into the current or parent directory.
+                continue;
+            }
+            
+            if (substr($rawListing, 0, 1) == 'd') {
+                // Entry is a directory: scan its contents as well.
+                $this->_frontierScan($path . $file . '/');
+            } else if ($file == 'index.shtml') {
+                // index.shtml is registered under its directory's path.
+                $this->_addSitePath($path);
+            } else {
+                $this->_addSitePath($path . $file);
+            }
+        }
+    }
+    
+    /**
+     * Appends a message to the log array ($this->_log).
+     *
+     * @param string $message Text to record.
+     */
+    private function _log($message)
+    {
+        $this->_log[] = $message;
+    }
 }
 
-- 
GitLab