Skip to content
Snippets Groups Projects
Forked from UNL Information Services / UNL-CMS
562 commits behind the upstream repository.
unl_migration.php 44.91 KiB
<?php

/**
 * Form builder for the migration tool.
 *
 * When the form is being rebuilt, only a "Continue" button is shown.
 * Otherwise the full form is returned: the URL of the site to migrate,
 * optional Frontier FTP details, a duplicate-handling checkbox and the
 * "Migrate" submit button.
 *
 * @param array $form
 *   The form array to extend.
 * @param array $form_state
 *   The form state; only its 'rebuild' flag is read here.
 *
 * @return array
 *   The completed form array.
 */
function unl_migration($form, &$form_state)
{
    // Intermediate "keep going" screen shown while the form is being rebuilt.
    if ($form_state['rebuild']) {
        $form['root'] = array(
            '#type' => 'fieldset',
            '#title' => 'This is taking a while.  Click continue.',
            'submit' => array(
                '#type' => 'submit',
                '#value' => 'Continue',
            ),
        );
        return $form;
    }

    // The regular migration form: every input lives inside one fieldset.
    $form['root'] = array(
        '#type' => 'fieldset',
        '#title' => 'Migration Tool',
        'site_url' => array(
            '#type' => 'textfield',
            '#title' => t('Site URL'),
            '#description' => t('Full URL to the existing site you wish to migrate'),
            '#required' => TRUE,
        ),
        'frontier_path' => array(
            '#type' => 'textfield',
            '#title' => t('Frontier FTP Path'),
            '#description' => t('Full path to the root of your site on frontier (if applicable).'),
            '#required' => FALSE,
        ),
        'frontier_user' => array(
            '#type' => 'textfield',
            '#title' => t('Frontier FTP Username'),
            '#required' => FALSE,
        ),
        'frontier_pass' => array(
            '#type' => 'password',
            '#title' => t('Frontier FTP Password'),
            '#required' => FALSE,
        ),
        'ignore_duplicates' => array(
            '#type' => 'checkbox',
            '#title' => t('Ignore Duplicate Pages/Files'),
            '#description' => t("This may be needed if your site has an unlimited number of dynamicly generated paths."),
        ),
    );

    $form['submit'] = array(
        '#type' => 'submit',
        '#value' => 'Migrate',
    );

    return $form;
}

/**
 * Submit handler for the migration form.
 *
 * Builds a Unl_Migration_Tool from the submitted values and registers a batch
 * whose single operation is unl_migration_step(), with that tool as its only
 * argument.
 *
 * @param array $form
 *   The submitted form (unused).
 * @param array $form_state
 *   The form state; the submitted values are read from its 'values' entry.
 */
function unl_migration_submit($form, &$form_state) {
  $submitted = $form_state['values'];

  $tool = new Unl_Migration_Tool(
    $submitted['site_url'],
    $submitted['frontier_path'],
    $submitted['frontier_user'],
    $submitted['frontier_pass'],
    $submitted['ignore_duplicates']
  );

  batch_set(array(
    'operations' => array(
      array('unl_migration_step', array($tool)),
    ),
    // Path of this file relative to DRUPAL_ROOT.
    'file' => substr(__FILE__, strlen(DRUPAL_ROOT) + 1),
  ));
}

/**
 * Batch operation: runs one time-boxed slice of the migration.
 *
 * If an earlier slice stored the tool on disk (its file is recorded in the
 * batch sandbox and still exists), that copy is loaded and used instead of
 * the $migration argument. When migrate() reports completion the batch is
 * marked finished; otherwise the reported progress is advanced by 0.01
 * (capped at 0.99) and the tool is saved to disk for the next slice.
 *
 * @param Unl_Migration_Tool $migration
 *   The tool as originally handed to the batch.
 * @param array $context
 *   The batch context; 'finished' and 'sandbox' are read and written.
 */
function unl_migration_step($migration, &$context)
{
  $progress = 0;
  $hasStoredTool = isset($context['sandbox']['file']) && file_exists($context['sandbox']['file']);
  if ($hasStoredTool) {
    $migration = Unl_Migration_Tool::load_from_disk($context['sandbox']['file']);
    $progress = $context['sandbox']['finished'];
  }

  $done = $migration->migrate();
  if ($done) {
    $context['finished'] = 1;
    return;
  }

  // The real amount of remaining work is unknown, so creep towards (but never reach) 1.
  $progress = min($progress + 0.01, 0.99);

  $context['finished'] = $progress;
  $context['sandbox']['finished'] = $progress;
  $context['sandbox']['file'] = Unl_Migration_Tool::save_to_disk($migration);
}

/**
 * Queue worker: runs one 30-second slice of a migration stored on disk.
 *
 * @param string $migration_storage_file
 *   File from which the Unl_Migration_Tool is loaded.
 *
 * @return bool
 *   TRUE when the migration finished; FALSE when it was saved again and
 *   re-queued on the 'unl_migration' queue for another slice.
 */
function unl_migration_queue_step($migration_storage_file) {
  $tool = Unl_Migration_Tool::load_from_disk($migration_storage_file);

  $done = $tool->migrate(30);
  if (!$done) {
    // Not finished yet: persist the current state and queue another slice.
    $queue = DrupalQueue::get('unl_migration', TRUE);
    $queue->createItem(Unl_Migration_Tool::save_to_disk($tool));
    return FALSE;
  }

  return TRUE;
}


class Unl_Migration_Tool
{
    /**
     * base url to the site to migrate, eg: http://www.unl.edu/band/
     *
     * @var string
     */
    private $_baseUrl;

    /**
     * base path to frontier dir, eg: /cwis/data/band
     *
     * @var string
     */
    private $_frontierPath;
    private $_frontierUser;
    private $_frontierPass;
    private $_frontier;

    private $_curl;

    private $_siteMap             = array();
    private $_processedPages      = array();
    private $_processedPageHashes = array();
    private $_content             = array();
    private $_createdContent      = array();
    private $_lastModifications   = array();
    private $_redirects           = array();
    private $_hrefTransform       = array();
    private $_hrefTransformFiles  = array();
    private $_menu                = array();
    private $_breadcrumbs         = array();
    private $_nodeMap             = array();
    private $_pageTitles          = array();
    private $_pageParentLinks     = array();
    private $_log                 = array();
    private $_blocks              = array();
    private $_isFrontier          = FALSE;
    private $_frontierIndexFiles  = array('low_bandwidth.shtml', 'index.shtml', 'index.html', 'index.htm', 'default.shtml');
    private $_frontierFilesScanned = array();
    private $_ignoreDuplicates    = FALSE;
    
    /**
     * Keep track of the state of the migration progress so that we can resume later
     * @var int
     */
    public $_state = self::STATE_NONE;
    const STATE_NONE              = 1;
    const STATE_PROCESSING_BLOCKS = 2;
    const STATE_PROCESSING_PAGES  = 3;
    const STATE_CREATING_NODES    = 4;
    const STATE_DONE              = 5;
    
    private $_start_time;
    
    /**
     * Sets up a migration of the site at $baseUrl.
     *
     * Both the base URL and the frontier path are trimmed and given a trailing
     * slash (the frontier path only when non-empty). The site root ('') is
     * registered as the first path to crawl.
     *
     * @param string $baseUrl          Base URL of the site to migrate.
     * @param string $frontierPath     Path to the site's root on frontier (may be empty).
     * @param string $frontierUser     Frontier FTP user name.
     * @param string $frontierPass     Frontier FTP password.
     * @param mixed  $ignoreDuplicates Truthy to skip pages whose content duplicates an earlier page.
     */
    public function __construct($baseUrl, $frontierPath, $frontierUser, $frontierPass, $ignoreDuplicates)
    {
        header('Content-type: text/plain');

        // Normalise the URL before parsing it, so surrounding whitespace
        // cannot interfere with host detection. Add trailing slash if necessary.
        $baseUrl = trim($baseUrl);
        if (substr($baseUrl, -1) != '/') {
            $baseUrl .= '/';
        }

        // Check to see if we're migrating from frontier so we can make some extra assumptions.
        // parse_url() returns FALSE for malformed URLs and omits 'host' when there is
        // no scheme, so only attempt the DNS lookups when a host is actually present.
        $baseUrlParts = parse_url($baseUrl);
        if (is_array($baseUrlParts) && !empty($baseUrlParts['host'])) {
            $remoteHostname = @gethostbyaddr(gethostbyname($baseUrlParts['host']));
            if ($remoteHostname == 'frontier.unl.edu') {
                $this->_isFrontier = TRUE;
            }
        }

        $frontierPath = trim($frontierPath);
        if ($frontierPath && substr($frontierPath, -1) != '/') {
            $frontierPath .= '/';
        }

        $this->_frontierPath = $frontierPath;
        $this->_frontierUser = $frontierUser;
        $this->_frontierPass = $frontierPass;

        $this->_ignoreDuplicates = (bool) $ignoreDuplicates;

        $this->_baseUrl = $baseUrl;
        $this->_addSitePath('');
    }
    
    /**
     * Runs (or resumes) the migration for roughly $time_limit seconds.
     *
     * Progress is tracked in $this->_state, so the method can be called
     * repeatedly until it returns TRUE. The stages are: frontier scan,
     * menu/block/breadcrumb parsing, page crawling, then node creation.
     *
     * @param int $time_limit
     *   Approximate number of seconds to work before yielding.
     *
     * @return bool
     *   TRUE when there is nothing more to do (including when the sanity check
     *   fails); FALSE when the time limit was hit and another call is needed.
     */
    public function migrate($time_limit = 5)
    {
        if (!$this->_sanity_check()) {
            return TRUE;
        }

        $this->_start_time = time();
        ini_set('memory_limit', -1);

        if ($this->_state == self::STATE_NONE) {
            if (!$this->_frontierScan('', $time_limit)) {
                return FALSE;
            }

            $this->_state = self::STATE_PROCESSING_BLOCKS;
            if (time() - $this->_start_time > $time_limit) {
                return FALSE;
            }
        }

        if ($this->_state == self::STATE_PROCESSING_BLOCKS) {
            // Parse the menu
            $this->_processMenu();
            $this->_process_blocks();
            $this->_process_breadcrumbs();
            $this->_state = self::STATE_PROCESSING_PAGES;
        }

        if ($this->_state == self::STATE_PROCESSING_PAGES) {
            // Process all of the pages on the site (Takes a while).
            // Processing a page can discover new paths, hence the outer loop.
            do {
                set_time_limit(30);

                $pagesToProcess = $this->_getPagesToProcess();
                foreach ($pagesToProcess as $pageToProcess) {
                    if (time() - $this->_start_time > $time_limit) {
                        return FALSE;
                    }
                    $this->_processPage($pageToProcess);
                }
            } while (count($pagesToProcess) > 0);

            // Fix any links to files that got moved to sites/<site>/files
            foreach ($this->_hrefTransform as &$transforms) {
                // Empty hrefs must not be rewritten.
                unset($transforms['']);
                foreach ($transforms as &$newPath) {
                    if (array_key_exists($newPath, $this->_redirects)) {
                        $newPath = $this->_redirects[$newPath];
                    }
                    if (array_key_exists($newPath, $this->_hrefTransformFiles)) {
                        $newPath = $this->_hrefTransformFiles[$newPath];
                    }
                }
                // Break the reference so later writes to $newPath cannot
                // silently modify the last transform.
                unset($newPath);
            }
            unset($transforms);

            $this->_state = self::STATE_CREATING_NODES;
        }

        if ($this->_state == self::STATE_CREATING_NODES) {
            // Update links and then create new page nodes. (Takes a while)
            foreach ($this->_content as $path => $content) {
                if (in_array($path, $this->_createdContent, TRUE)) {
                    continue;
                }
                if (time() - $this->_start_time > $time_limit) {
                    return FALSE;
                }
                set_time_limit(30);

                $hrefTransform = isset($this->_hrefTransform[$path]) ? $this->_hrefTransform[$path] : array();
                $content = strtr($content, $hrefTransform);

                // Array keys that look like integers (e.g. the path "0") come back
                // as ints; compare as strings so only the true site root counts as
                // the front page.
                $isFrontPage = ('' === (string) $path);

                $pageTitle = $this->_pageTitles[$path];
                $this->_createPage($pageTitle, $content, $path, $isFrontPage);
                $this->_createdContent[] = $path;
            }

            $this->_createMenu();
            $this->_create_blocks();
            $this->_create_breadcrumbs();

            $this->_state = self::STATE_DONE;
        }

        return TRUE;
    }

    /**
     * Verifies that the base URL can be fetched.
     *
     * @return bool
     *   TRUE when _getUrl() returns a truthy result for the base URL;
     *   otherwise a form error is set and FALSE is returned.
     */
    private function _sanity_check() {
      $siteResponds = (bool) $this->_getUrl($this->_baseUrl);
      if ($siteResponds) {
        return TRUE;
      }

      form_set_error('unl', 'The specified site does not exist!');
      return FALSE;
    }
    
    /**
     * Registers a site-relative path in the site map.
     *
     * Anything from the last '#' onwards is dropped first; the map is keyed by
     * the SHA-256 hash of the resulting path.
     *
     * @param string $path
     */
    private function _addSitePath($path)
    {
        $hashPosition = strrpos($path, '#');
        if ($hashPosition !== FALSE) {
            $path = substr($path, 0, $hashPosition);
        }

        $mapKey = hash('SHA256', $path);
        $this->_siteMap[$mapKey] = $path;
    }
    
    /**
     * Returns the site-map paths that have not been processed yet.
     *
     * Both $_siteMap and $_processedPages are keyed by the SHA-256 hash of the
     * path (see _addSitePath() and _addProcessedPage()), so the pending set is
     * simply the key difference. This is linear in the number of paths and,
     * unlike a loose in_array() scan, never treats two different numeric-looking
     * paths (e.g. "1" and "1.0") as the same page.
     *
     * @return array
     *   Pending paths, in site-map order, re-indexed from 0.
     */
    private function _getPagesToProcess()
    {
        return array_values(array_diff_key($this->_siteMap, $this->_processedPages));
    }
    
    /**
     * Marks a path as processed, keyed by the SHA-256 hash of the path.
     *
     * @param string $path
     */
    private function _addProcessedPage($path)
    {
        $pageKey = hash('SHA256', $path);
        $this->_processedPages[$pageKey] = $path;
    }
    
    /**
     * Parses the navigation menu of the site's front page into $this->_menu.
     *
     * Every link inside the element with id "navigation" is registered for
     * crawling (under the '<menu>' transform tag). The first <ul> inside it is
     * then read as a two-level menu: each top-level <li> becomes a menu item
     * with 'text' and 'href', plus 'children' when it contains a nested <ul>.
     */
    private function _processMenu()
    {
        $content = $this->_getUrl($this->_baseUrl);
        $html = $content['content'];
        if (!$html) {
            $this->_log('Could not find the navigation menu for your site!', WATCHDOG_ERROR);
            return;
        }

        $dom = new DOMDocument();
        // Real-world pages are rarely valid HTML; silence parser warnings the
        // same way _processPage() does.
        @$dom->loadHTML($html);
        $navlinksNode = $dom->getElementById('navigation');
        if (!$navlinksNode) {
            $this->_log('Could not find the navigation menu for your site!', WATCHDOG_ERROR);
            return;
        }

        // Check to see if there's a base tag on this page.
        $base_tags = $dom->getElementsByTagName('base');
        $page_base = NULL;
        if ($base_tags->length > 0) {
            $page_base = $base_tags->item(0)->getAttribute('href');
        }

        $linkNodes = $navlinksNode->getElementsByTagName('a');
        foreach ($linkNodes as $linkNode) {
            $this->_processLinks($linkNode->getAttribute('href'), '', $page_base, '<menu>');
        }

        // A navigation element without any list has no menu structure to read.
        $navlinksUlNode = $navlinksNode->getElementsByTagName('ul')->item(0);
        $primaryLinkLiNodes = $navlinksUlNode ? $navlinksUlNode->childNodes : array();

        foreach ($primaryLinkLiNodes as $primaryLinkLiNode) {
            if (strtolower($primaryLinkLiNode->nodeName) != 'li') {
                continue;
            }
            $primaryLinkNode = $primaryLinkLiNode->getElementsByTagName('a')->item(0);
            // A list item without any link cannot become a menu item.
            if (!$primaryLinkNode) {
                continue;
            }
            $menuItem = array('text' => trim($primaryLinkNode->textContent),
                              'href' => $this->_makeLinkAbsolute($primaryLinkNode->getAttribute('href'), ''));

            $childLinksUlNode = $primaryLinkLiNode->getElementsByTagName('ul')->item(0);
            if (!$childLinksUlNode) {
                $this->_menu[] = $menuItem;
                continue;
            }
            $childMenu = array();
            foreach ($childLinksUlNode->childNodes as $childLinkLiNode) {
                if (strtolower($childLinkLiNode->nodeName) != 'li') {
                    continue;
                }
                $childLinkNode = $childLinkLiNode->getElementsByTagName('a')->item(0);
                // If somebody left this menu item empty, skip it.  Liferay, I'm looking at you!
                if (!$childLinkNode || !$childLinkNode->hasAttribute('href')) {
                    continue;
                }
                $childMenu[] = array('text' => trim($childLinkNode->textContent),
                                     'href' => $this->_makeLinkAbsolute($childLinkNode->getAttribute('href'), ''));
            }
            $menuItem['children'] = $childMenu;
            $this->_menu[] = $menuItem;
        }

        if (count($this->_menu) == 0) {
            $this->_log('Could not find the navigation menu for your site!', WATCHDOG_ERROR);
        }
    }

    /**
     * Creates the main-menu links for the migrated site.
     *
     * Removes a top-level "Home" link pointing at <front>, creates one link per
     * parsed primary menu item (and its children), then creates the hidden
     * hierarchy links recorded in $this->_pageParentLinks.
     */
    private function _createMenu()
    {
        // Start off by removing the "Home" menu link if it exists.
        $menu_links = menu_load_links('main-menu');
        foreach ($menu_links as $menu_link) {
            if ($menu_link['plid'] == 0 &&
                $menu_link['link_title'] == 'Home' &&
                $menu_link['link_path'] == '<front>') {
                menu_link_delete($menu_link['mlid']);
            }
        }

        // Now create each primary menu item followed by its children.
        $primaryWeights = 1;
        foreach ($this->_menu as $primaryMenu) {
            $item = $this->_resolveMenuLinkTarget(array(
                'expanded' => TRUE,
                'menu_name' => 'main-menu',
                'link_title' => $primaryMenu['text'],
                'link_path' => '',
                'weight' => $primaryWeights++
            ), $primaryMenu['href']);

            if (!$item['link_path']) {
                $this->_log('Could not find a node to link to the ' . $item['link_title'] . ' menu item.', WATCHDOG_ERROR);
                continue;
            }
            menu_link_save($item);
            $this->_log('Created menu item "' . $item['link_title'] . '" linked to ' . $item['link_path'] . '.');

            if (!array_key_exists('children', $primaryMenu)) {
                continue;
            }

            // $item['mlid'] is read back from $item after menu_link_save().
            $plid = $item['mlid'];
            $parentTitle = $item['link_title'];
            $childWeights = 1;
            foreach ($primaryMenu['children'] as $childMenu) {
                $item = $this->_resolveMenuLinkTarget(array(
                    'menu_name' => 'main-menu',
                    'link_title' => $childMenu['text'],
                    'link_path' => '',
                    'plid' => $plid,
                    'weight' => $childWeights++
                ), $childMenu['href']);

                if ($item['link_path']) {
                    menu_link_save($item);
                    $this->_log('Created menu item "' . $parentTitle . ' / ' . $item['link_title'] . '" linked to ' . $item['link_path'] . '.');
                } else {
                    $this->_log('Could not find a node to link to the "' . $parentTitle . ' / ' . $item['link_title'] . '" menu.', WATCHDOG_ERROR);
                }
            }
        }

        // Now set up the site hierarchy
        foreach ($this->_pageParentLinks as $path => $parentLink) {
            $this->_createParentLink($path, $parentLink);
        }
    }

    /**
     * Fills in 'link_path' (and a fragment option, if any) of a menu item.
     *
     * Links outside the migrated site keep their absolute URL. Links inside it
     * are mapped to 'node/<nid>' through $this->_nodeMap; when no node matches,
     * 'link_path' is left as given.
     *
     * @param array  $item Menu link array under construction.
     * @param string $href Absolute URL the menu entry pointed at.
     *
     * @return array The updated menu link array.
     */
    private function _resolveMenuLinkTarget(array $item, $href)
    {
        $baseLength = strlen($this->_baseUrl);
        if (substr($href, 0, $baseLength) != $this->_baseUrl) {
            $item['link_path'] = $href;
            return $item;
        }

        // substr() may return FALSE for an empty remainder on older PHP versions.
        $path = (string) substr($href, $baseLength);

        // The parentheses matter: without them the comparison result (a boolean)
        // would be assigned instead of the position.
        if (($fragmentPos = strrpos($path, '#')) !== FALSE) {
            $item['options']['fragment'] = substr($path, $fragmentPos + 1);
            $path = (string) substr($path, 0, $fragmentPos);
        }
        if (substr($path, -1) == '/') {
            $path = (string) substr($path, 0, -1);
        }

        $nodeId = array_search($path, $this->_nodeMap, TRUE);
        if ($nodeId) {
            $item['link_path'] = 'node/' . $nodeId;
        }
        return $item;
    }
    
    /**
     * Ensures a (hidden) main-menu link exists for $childPath under its parent.
     *
     * Missing parent links are created recursively from $this->_pageParentLinks.
     *
     * @param string $childPath  Site-relative path of the page.
     * @param string $parentPath Site-relative path of its parent page.
     * @param array  $visited    Internal: child paths already on the current
     *                           recursion chain, used to stop on cyclic
     *                           parent links. Callers should not pass it.
     *
     * @return int The mlid of the child's menu link, or 0 for the site root.
     */
    private function _createParentLink($childPath, $parentPath, array $visited = array()) {

      // If the child is the site root, just return the root mlid.
      if (!$childPath) {
        return 0;
      }

      // If the child link already exists, just return its mlid.
      $childNodePath = drupal_get_normal_path(rtrim($childPath, '/'));
      $childLink = menu_link_get_preferred($childNodePath);
      if ($childLink && $childLink['link_path'] != 'node/%') {
        return $childLink['mlid'];
      }

      // Two pages naming each other as parent would otherwise recurse forever;
      // treat a page met twice on the same chain as hanging off the site root.
      if (isset($visited[$childPath])) {
        return 0;
      }
      $visited[$childPath] = TRUE;

      // Find the parent link, if it doesn't exist, recursively create it.
      $parentNodePath = drupal_get_normal_path(rtrim($parentPath, '/'));
      $parentLink = menu_link_get_preferred($parentNodePath);
      if ($parentLink) {
        $parentLinkId = $parentLink['mlid'];
      } else if (substr($parentNodePath, 0, 5) != 'node/') {
        // This will catch invalid breadcrumb links and change them to point to the site root.
        $parentLinkId = 0;
      } else {
        // A parent with no recorded parent of its own hangs off the site root.
        $grandparentPath = isset($this->_pageParentLinks[$parentPath]) ? $this->_pageParentLinks[$parentPath] : '';
        $parentLinkId = $this->_createParentLink($parentPath, $grandparentPath, $visited);
      }

      // Create the menu item.
      $item = array(
        'menu_name' => 'main-menu',
        // Fall back to the path when no title was recorded for this page.
        'link_title' => isset($this->_pageTitles[$childPath]) ? $this->_pageTitles[$childPath] : $childPath,
        'link_path' => $childNodePath,
        'plid' => $parentLinkId,
        'weight' => 50,
        'hidden' => 1,
      );
      menu_link_save($item);

      // Return its mlid.
      return $item['mlid'];
    }
    
    /**
     * Captures the front page's editable regions for the site-wide blocks.
     *
     * The 'leftcollinks', 'contactinfo', 'optionalfooter' and 'footercontent'
     * regions are extracted with _get_instance_editable_content() and stored in
     * $this->_blocks; the stock <h3> headings are then stripped from the
     * related-links and contact-info content.
     */
    private function _process_blocks() {
      $content = $this->_getUrl($this->_baseUrl);
      $html = $content['content'];

      $this->_blocks['related_links'] = $this->_get_instance_editable_content($html, 'leftcollinks');
      $this->_blocks['contact_info'] = $this->_get_instance_editable_content($html, 'contactinfo');
      $this->_blocks['optional_footer'] = $this->_get_instance_editable_content($html, 'optionalfooter');
      $this->_blocks['footer_content'] = $this->_get_instance_editable_content($html, 'footercontent');

      // Filter out the existing headers.
      $this->_blocks['related_links'] = preg_replace('/\s*<h3>\s*Related Links\s*<\/h3>\s*/', '', $this->_blocks['related_links']);
      // Optional whitespace on both sides of the heading text, as in the other two patterns.
      $this->_blocks['contact_info'] = preg_replace('/\s*<h3>\s*Contacting Us\s*<\/h3>\s*/', '', $this->_blocks['contact_info']);
      $this->_blocks['contact_info'] = preg_replace('/\s*<h3>\s*Contact Us\s*<\/h3>\s*/', '', $this->_blocks['contact_info']);
    }
    
    /**
     * Writes the captured block content into the block_custom table.
     *
     * Custom blocks 101-104 receive, in that order, the contact info, related
     * links, optional footer and footer content.
     */
    private function _create_blocks() {
      // Custom block id => key in $this->_blocks.
      $blockSources = array(
        101 => 'contact_info',
        102 => 'related_links',
        103 => 'optional_footer',
        104 => 'footer_content',
      );

      foreach ($blockSources as $bid => $blockKey) {
        db_update('block_custom')
          ->fields(array(
            'body'   => $this->_blocks[$blockKey],
          ))
          ->condition('bid', $bid)
          ->execute();
      }
    }
    
    /**
     * Reads the intermediate breadcrumbs from the site's front page.
     *
     * The links inside the element with id "breadcrumbs" are collected into
     * $this->_breadcrumbs as 'text'/'href' pairs. The first link is always
     * skipped; the last link is skipped too when the list has more <li>
     * elements than links.
     */
    private function _process_breadcrumbs() {
      $content = $this->_getUrl($this->_baseUrl);
      $html = $content['content'];
      if (!$html) {
        return;
      }

      $dom = new DOMDocument();
      // Silence parser warnings on invalid markup, as _processPage() does.
      @$dom->loadHTML($html);
      $breadcrumbs_node = $dom->getElementById('breadcrumbs');
      if (!$breadcrumbs_node) {
        return;
      }

      $link_nodes = $breadcrumbs_node->getElementsByTagName('a');
      $list_nodes = $breadcrumbs_node->getElementsByTagName('li');
      $unlinked_node = FALSE;
      if ($list_nodes->length > $link_nodes->length) {
        $unlinked_node = TRUE;
      }

      // Scan each of the breadcrumb links, skipping the first and the last (but only if there's an un-linked "true" last breadcrumb)
      $last_index = $link_nodes->length - ($unlinked_node ? 1 : 0);
      for ($i = 1; $i < $last_index; $i++) {
        $link_node = $link_nodes->item($i);
        $this->_breadcrumbs[] = array(
          'text' => trim($link_node->textContent),
          // Resolve relative to the site root, like the menu links.
          'href' => $this->_makeLinkAbsolute($link_node->getAttribute('href'), '')
        );
      }
    }
    
    /**
     * Stores the collected breadcrumbs in the unl_wdn theme settings.
     *
     * Only the 'intermediate_breadcrumbs' entry of the
     * 'theme_unl_wdn_settings' variable is replaced; other entries are kept.
     */
    private function _create_breadcrumbs() {
      $themeSettings = variable_get('theme_unl_wdn_settings', array());

      $themeSettings['intermediate_breadcrumbs'] = $this->_breadcrumbs;

      variable_set('theme_unl_wdn_settings', $themeSettings);
    }
    
    /**
     * Fetches one path of the old site and records what is needed to migrate it.
     *
     * Non-HTML responses are saved as public files and a link rewrite to the
     * new file URL is recorded. HTML responses have their main content, title,
     * outgoing links/images and parent breadcrumb extracted into
     * $_content, $_pageTitles, $_hrefTransform / the site map, and
     * $_pageParentLinks respectively.
     *
     * @param string $path Site-relative path (relative to $this->_baseUrl).
     */
    private function _processPage($path)
    {
        $this->_addProcessedPage($path);
        $fullPath = $this->_baseUrl . $path;

        $data = $this->_getUrl($fullPath);
        if (empty($data['content'])) {
            $this->_log('The file at ' . $fullPath . ' was empty! Ignoring.', WATCHDOG_ERROR);
            return;
        }

        // Detect content that was already seen under another path.
        $pageHash = hash('md5', $data['content']);
        if (($matchingPath = array_search($pageHash, $this->_processedPageHashes)) !== FALSE) {
            $logMessage = "The file found at $fullPath was a duplicate of the file at {$this->_baseUrl}$matchingPath !";
            if ($this->_ignoreDuplicates) {
                $this->_log($logMessage . ' Ignoring.', WATCHDOG_WARNING);
                return;
            } else {
                $this->_log($logMessage, WATCHDOG_WARNING);
            }
        }
        $this->_processedPageHashes[$path] = $pageHash;

        if (isset($data['lastModified'])) {
            $this->_lastModifications[$path] = $data['lastModified'];
        }

        // Anything that is not HTML is migrated as a plain file.
        if (strpos($data['contentType'], 'html') === FALSE) {
            if (!$data['contentType']) {
                $this->_log('The file type at ' . $fullPath . ' was not specified. Ignoring.', WATCHDOG_ERROR);
                return;
            }
            @drupal_mkdir('public://' . urldecode(dirname($path)), NULL, TRUE);
            if (!mb_check_encoding($path, 'UTF-8')) {
                $path = iconv('ISO-8859-1', 'UTF-8', $path);
            }
            $file = FALSE;
            try {
              $file = file_save_data($data['content'], 'public://' . urldecode($path), FILE_EXISTS_REPLACE);
            } catch (Exception $e) {
              $file = FALSE;
            }
            if (!$file) {
              // Do not rewrite links to point at a file that was never written.
              $this->_log('Could not migrate file "' . $path . '"! File name too long?', WATCHDOG_ERROR);
              return;
            }
            $this->_hrefTransformFiles[$path] = $this->_makeRelativeUrl(file_create_url('public://' . $path));
            return;
        }
        $html = $data['content'];

        // Locate the main content, from the most to the least specific source.
        $maincontentarea = $this->_get_instance_editable_content($html, 'maincontentarea');
        if (!$maincontentarea) {
            $maincontentarea = $this->_get_old_main_content_area($html);
        }

        if (!$maincontentarea) {
            $this->_log('The file at ' . $fullPath . ' has no valid maincontentarea. Using entire body.', WATCHDOG_WARNING);
            $maincontentarea = $this->_get_text_between_tokens($html, '<body>', '</body>');
        }

        if (!$maincontentarea) {
            // its possible the body tag has attributes.  Check for this and filter them out.
            $maincontentarea = $this->_get_text_between_tokens($html, '<body', '</body>', FALSE);
            // As long as we find a closing bracket before the next opening bracket, its probably safe to assume the body tag is intact.
            // Both positions must actually exist: comparing FALSE with an integer
            // would otherwise accept text that contains no '>' at all.
            $closingBracket = strpos($maincontentarea, '>');
            $openingBracket = strpos($maincontentarea, '<');
            if ($closingBracket !== FALSE && $openingBracket !== FALSE && $closingBracket < $openingBracket) {
              $maincontentarea = trim(substr($maincontentarea, $closingBracket + 1));
              // Tidy the output here, otherwise tidy would see HTML starting in the middle of a <body key="val"> tag.
              $maincontentarea = $this->_tidy_html_fragment($maincontentarea);
            // Otherwise, ignore it all. (Will be an issue if the body has no other tags, but how likely is this?)
            } else {
              $maincontentarea = '';
            }
        }

        if (!$maincontentarea) {
            $this->_log('The file at ' . $fullPath . ' has no valid body. Ignoring.', WATCHDOG_ERROR);
            return;
        }

        $dom = new DOMDocument();
        @$dom->loadHTML($html);

        // Check to see if there's a base tag on this page.
        $base_tags = $dom->getElementsByTagName('base');
        $page_base = NULL;
        if ($base_tags->length > 0) {
          $page_base = $base_tags->item(0)->getAttribute('href');
        }

        $pageTitle = '';
        $pageTitleNode = $dom->getElementById('pagetitle');
        if ($pageTitleNode) {
            $pageTitleH2Nodes = $pageTitleNode->getElementsByTagName('h2');
            if ($pageTitleH2Nodes->length > 0) {
                $pageTitle = $pageTitleH2Nodes->item(0)->textContent;
            }
        }

        // If there is no WDN compliant title, search for others
        if (!$pageTitle) {
          // First, check for a WDN compliant <title>
          $titleText = '';
          $titleNodes = $dom->getElementsByTagName('title');
          if ($titleNodes->length > 0) {
            $titleText = $titleNodes->item(0)->textContent;
          }
          $titleParts = explode('|', $titleText);
          if (count($titleParts) > 2) {
            $pageTitle = trim(array_pop($titleParts));
          }
          // Finally, combine what title does exist with the last part of the path
          else {
            $filename = trim($path, '/');
            $filename = explode('/', $filename);
            $filename = array_pop($filename);
            // Strip off a file extension if it exists.
            if (strrpos($filename, '.') !== FALSE) {
              $filename = substr($filename, 0, strrpos($filename, '.'));
            }
            $pageTitle = "$titleText ($filename)";
          }
        }

        if (!$pageTitle) {
            $this->_log('No page title was found at ' . $fullPath . '.', WATCHDOG_ERROR);
            $pageTitle = 'Untitled';
        }

        // Links and images are collected from the #maincontent element, or
        // from the whole body when that element is missing.
        $maincontentNode = $dom->getElementById('maincontent');
        if (!$maincontentNode) {
            $this->_log('The file at ' . $fullPath . ' has no valid maincontentarea. Using entire body.', WATCHDOG_WARNING);
            $bodyNodes = $dom->getElementsByTagName('body');
            if ($bodyNodes->length == 0) {
                $this->_log('The file at ' . $fullPath . ' has no valid body. Ignoring.', WATCHDOG_ERROR);
                return;
            }
            $maincontentNode = $bodyNodes->item(0);
        }

        $linkNodes = $maincontentNode->getElementsByTagName('a');
        foreach ($linkNodes as $linkNode) {
            $this->_processLinks($linkNode->getAttribute('href'), $path, $page_base);
        }

        $linkNodes = $maincontentNode->getElementsByTagName('img');
        foreach ($linkNodes as $linkNode) {
            $this->_processLinks($linkNode->getAttribute('src'), $path, $page_base);
        }

        $this->_content[$path] = $maincontentarea;
        $this->_pageTitles[$path] = $pageTitle;

        // Scan the page for the parent breadcrumb
        $breadcrumbs = $dom->getElementById('breadcrumbs');
        if ($breadcrumbs) {
          $breadcrumbs = $breadcrumbs->getElementsByTagName('a');
          $breadcrumb = $breadcrumbs->item($breadcrumbs->length - 1);
          if ($breadcrumb) {
            $breadcrumb = $breadcrumb->getAttribute('href');
            $breadcrumb = $this->_makeLinkAbsolute($breadcrumb, $path);
            if (substr($breadcrumb, 0, strlen($this->_baseUrl)) == $this->_baseUrl && $breadcrumb != $this->_baseUrl) {
              $pageParentLink = substr($breadcrumb, strlen($this->_baseUrl));
            } else {
              $pageParentLink = '';
            }
            // A page cannot be its own parent.
            if ($pageParentLink == $path) {
              $pageParentLink = '';
            }
            $this->_pageParentLinks[$path] = $pageParentLink;
          }
        }
    }
    
    /**
     * Records a link found on a page so it can be crawled and rewritten later.
     *
     * Pure fragment links ('#...') are ignored. Links that resolve to a URL
     * under the base URL are added to the site map, and the original href is
     * mapped to its site-relative path in $_hrefTransform, grouped by $tag when
     * given and by $path otherwise.
     *
     * @param string      $originalHref The href/src exactly as found in the markup.
     * @param string      $path         Site-relative path of the page containing it.
     * @param string|null $page_base    The page's <base href>, if any.
     * @param string|null $tag          Alternative grouping key for the transform.
     */
    private function _processLinks($originalHref, $path, $page_base = NULL, $tag = NULL)
    {
        $isFragmentOnly = (substr($originalHref, 0, 1) == '#');
        if ($isFragmentOnly) {
            return;
        }

        // Resolve against the page's base tag when there is one, else the page itself.
        $resolveAgainst = $page_base ? $page_base : $path;
        $absoluteHref = $this->_makeLinkAbsolute($originalHref, $resolveAgainst);

        // Only links inside the migrated site are of interest.
        $baseLength = strlen($this->_baseUrl);
        if (substr($absoluteHref, 0, $baseLength) != $this->_baseUrl) {
            return;
        }

        $sitePath = substr($absoluteHref, $baseLength);
        if ($sitePath === FALSE) {
            $sitePath = '';
        }

        $groupKey = $tag ? $tag : $path;
        $this->_hrefTransform[$groupKey][$originalHref] = $sitePath;

        $this->_addSitePath($sitePath);
    }
    
    /**
     * Resolves $href, as found on the page at $path, to a normalised absolute URL.
     *
     * Hrefs with a scheme other than http/https are returned untouched.
     * Host-relative hrefs are resolved against the base URL's scheme, host and
     * port; other relative hrefs against the directory of $path below the base
     * URL. "/./" and "/../" segments are collapsed, and on frontier sites
     * known index file names are replaced by their directory.
     *
     * @param string $href The link to resolve.
     * @param string $path Site-relative path of the containing page, or an
     *                     absolute URL (e.g. from a <base> tag).
     *
     * @return string The absolute URL, or $href itself for non-http(s) schemes.
     */
    private function _makeLinkAbsolute($href, $path)
    {
        $path_parts = parse_url($path);
        if (isset($path_parts['scheme'])) {
            // NOTE(review): an absolute $path (e.g. a <base href>) is currently
            // only discarded, so the link is resolved against the site root
            // rather than against that URL. Confirm whether that is intended.
            $path = '';
        }

        if (substr($path, -1) == '/') {
            $intermediatePath = $path;
        } else {
            $intermediatePath = dirname($path);
        }
        if ($intermediatePath == '.') {
            $intermediatePath = '';
        }
        if (strlen($intermediatePath) > 0 && substr($intermediatePath, -1) != '/') {
            $intermediatePath .= '/';
        }

        $parts = parse_url($href);
        if (isset($parts['scheme']) && !in_array($parts['scheme'], array('http', 'https'))) {
            return $href;
        }
        if (isset($parts['scheme'])) {
            $absoluteUrl = $href;
        } else if (isset($parts['path']) && substr($parts['path'], 0, 1) == '/') {
            // Host-relative link: borrow scheme, host and port from the base URL.
            $baseParts = parse_url($this->_baseUrl);
            $absoluteUrl = $baseParts['scheme'] . '://' . $baseParts['host'];
            if (isset($baseParts['port'])) {
                $absoluteUrl .= ':' . $baseParts['port'];
            }
            $absoluteUrl .= $parts['path'];
            if (isset($parts['query'])) {
                $absoluteUrl .= '?' . $parts['query'];
            }
            if (isset($parts['fragment'])) {
                $absoluteUrl .= '#' . $parts['fragment'];
            }
        } else if (substr($href, 0, 1) == '#') {
            $absoluteUrl = $this->_baseUrl . $path . $href;
        } else {
            $absoluteUrl = $this->_baseUrl . $intermediatePath . $href;
        }

        // parse_url() returns FALSE on seriously malformed URLs and may omit the
        // scheme/host; such URLs cannot be normalised, so hand them back as built.
        $parts = parse_url($absoluteUrl);
        if (!is_array($parts) || !isset($parts['scheme'], $parts['host'])) {
            return $absoluteUrl;
        }

        if (isset($parts['path'])) {
            while (strpos($parts['path'], '/./') !== FALSE) {
                $parts['path'] = strtr($parts['path'], array('/./' => '/'));
            }
            $i = 0;
            while (strpos($parts['path'], '/../') !== FALSE) {
                $parts['path'] = preg_replace('/\\/[^\\/]*\\/\\.\\.\\//', '/', $parts['path']);
                $parts['path'] = preg_replace('/^\\/\\.\\.\\//', '/', $parts['path']);
                // Prevent infinite loops if we get some crazy url. Give up on
                // normalising this one URL instead of terminating the request.
                if ($i++ > 100) {
                    break;
                }
            }
        }

        // Keep an explicit port; dropping it would make URLs on a non-default
        // port stop matching the base URL.
        $origin = $parts['scheme'] . '://' . $parts['host'];
        if (isset($parts['port'])) {
            $origin .= ':' . $parts['port'];
        }
        $suffix  = isset($parts['query']) ? '?' . $parts['query'] : '';
        $suffix .= isset($parts['fragment']) ? '#' . $parts['fragment'] : '';

        $absoluteUrl = $origin . (isset($parts['path']) ? $parts['path'] : '') . $suffix;

        if (
          $this->_isFrontier &&
          isset($parts['path']) &&
          substr($absoluteUrl, 0, strlen($this->_baseUrl)) == $this->_baseUrl &&
          in_array(basename($parts['path']), $this->_frontierIndexFiles, TRUE)
        ) {
            // Point at the directory instead of its index file.
            $parts['path'] = dirname($parts['path']) . '/';
            while (substr($parts['path'], 0, 2) == '//') {
              $parts['path'] = substr($parts['path'], 1);
            }

            $absoluteUrl = $origin . $parts['path'] . $suffix;
        }

        return $absoluteUrl;
    }
    
    /**
     * Turns an absolute URL into one that is relative to a base URL.
     *
     * A URL that does not start with the base URL is returned unchanged. For
     * URLs below the base, the result depends on the 'unl_use_base_tag'
     * variable: when it is on, the base URL prefix is simply cut off;
     * otherwise the path, query and fragment of $href are returned.
     *
     * @param string $href    Absolute URL to convert.
     * @param string $baseUrl Optional base; defaults to the absolute URL of
     *                        this site's front page.
     * @return string
     */
    private function _makeRelativeUrl($href, $baseUrl = '') {
      if (!$baseUrl) {
        $baseUrl = url('<front>', array('absolute' => TRUE));
      }
      
      // Anything outside the base URL is left alone.
      $prefixLength = strlen($baseUrl);
      if (substr($href, 0, $prefixLength) != $baseUrl) {
        return $href;
      }
      
      if (variable_get('unl_use_base_tag', TRUE)) {
        return substr($href, $prefixLength);
      }
      
      // Rebuild everything after the host from its parsed pieces.
      $urlParts = parse_url($href);
      $relativeUrl = $urlParts['path'];
      if (isset($urlParts['query'])) {
        $relativeUrl .= '?' . $urlParts['query'];
      }
      if (isset($urlParts['fragment'])) {
        $relativeUrl .= '#'.$urlParts['fragment'];
      }
      return $relativeUrl;
    }
    
    /**
     * Saves a new "page" node and registers it in $this->_nodeMap.
     *
     * The node is owned by the current user, uses the first text format
     * returned by filter_formats() for its body and gets $alias (minus one
     * trailing slash) as its path alias. When a modification time is known for
     * the alias, the node's created/changed timestamps are overwritten with it.
     *
     * @param string $title         Node title.
     * @param string $content       Body markup.
     * @param string $alias         Path alias for the node.
     * @param bool   $makeFrontPage When TRUE, the node becomes the site front
     *                              page and its title becomes the site name.
     */
    private function _createPage($title, $content, $alias = '', $makeFrontPage = FALSE)
    {
        // Aliases are stored without a trailing slash.
        if (substr($alias, -1) == '/') {
            $alias = substr($alias, 0, -1);
        }
        
        $page = new StdClass();
        $page->uid = $GLOBALS['user']->uid;
        $page->type = 'page';
        $page->title = $title;
        $page->language = 'und';
        $page->path['alias'] = $alias;
        if (module_exists('pathauto')) {
          // Keep pathauto from replacing the alias set above.
          $page->path['pathauto'] = FALSE;
        }
        
        $formatIds = array_keys(filter_formats());
        $bodyItem = array(
          'value' => $content,
          'format' => array_shift($formatIds),
        );
        $page->body = array('und' => array($bodyItem));
        
        node_submit($page);
        try {
            node_save($page);
        } catch (Exception $e) {
            $this->_log('Could not save page at ' . $alias . '. This is probably a case sensitivity conflict.', WATCHDOG_ERROR);
            return;
        }
        
        // Carry the original modification time over to the new node, if known.
        if (isset($this->_lastModifications[$alias])) {
            $timestamp = $this->_lastModifications[$alias];
            db_update('node')
                ->fields(array(
                    'created' => $timestamp,
                    'changed' => $timestamp
                ))
                ->condition('nid', $page->nid)
                ->execute();
        }
        
        $this->_nodeMap[$page->nid] = $alias;
        
        if ($makeFrontPage) {
            variable_set('site_frontpage', 'node/' . $page->nid);
            variable_set('site_name', $title);
        }
        
        $this->_log('Created page "' . $title . '" with node id ' . $page->nid . ' at ' . $alias . '.');
    }
    
    /**
     * Downloads a URL with the shared cURL handle.
     *
     * A header-only request is made first so that files larger than 10MB are
     * never downloaded (their content is reported as an empty string). 301/302
     * responses are recorded in $this->_redirects and, when the target lies
     * inside the site, queued through _addSitePath().
     *
     * @param string $url Absolute URL to fetch; spaces are sent as %20.
     * @return array|false FALSE on a redirect or any non-200 status, otherwise
     *                     an array with 'content', 'contentType' and, for sites
     *                     with a frontier path and a known date, 'lastModified'.
     */
    private function _getUrl($url)
    {
        if (!$this->_curl) {
          $this->_curl = curl_init();
        }
        $url = strtr($url, array(' ' => '%20'));
        curl_setopt($this->_curl, CURLOPT_URL, $url);
        curl_setopt($this->_curl, CURLOPT_RETURNTRANSFER, TRUE);
        curl_setopt($this->_curl, CURLOPT_HEADER, TRUE);
        curl_setopt($this->_curl, CURLOPT_NOBODY, TRUE);
        
        $data = curl_exec($this->_curl);
        $meta = curl_getinfo($this->_curl);
        
        $rawHeaders = substr($data, 0, $meta['header_size']);
        $rawHeaders = trim($rawHeaders);
        $rawHeaders = explode("\n", $rawHeaders);
        // The first line is the status line, not a header.
        array_shift($rawHeaders);
        $headers = array();
        foreach ($rawHeaders as $rawHeader) {
            $splitPos = strpos($rawHeader, ':');
            if ($splitPos === FALSE) {
                continue;
            }
            // Header names are case-insensitive, so index them in lower case.
            $headerKey = strtolower(trim(substr($rawHeader, 0, $splitPos)));
            $headers[$headerKey] = trim(substr($rawHeader, $splitPos + 1));
        }
        
        // don't copy files greater than 10MB in size
        if (isset($headers['content-length']) && $headers['content-length'] > (10 * 1024 * 1024)) {
            $size = floor($headers['content-length'] / (1024 * 1024)); 
            $this->_log("The file at $url is $size MB!  Ignoring.", WATCHDOG_ERROR);
            $content = '';
        } else {
            curl_setopt($this->_curl, CURLOPT_NOBODY, FALSE);
            $data = curl_exec($this->_curl);
            $meta = curl_getinfo($this->_curl);
            $content = substr($data, $meta['header_size']);
        }
        
        
        if (in_array($meta['http_code'], array(301, 302))) {
            if (!isset($headers['location']) || $headers['location'] === '') {
                $this->_log('HTTP ' . $meta['http_code'] . ' without a Location header while fetching ' . $url . '.', WATCHDOG_ERROR);
                return FALSE;
            }
            $location = $headers['location'];
            $baseLength = strlen($this->_baseUrl);
            $source = substr($url, $baseLength);
            
            if (substr($location, 0, $baseLength) == $this->_baseUrl) {
                // Only targets inside the site are worth crawling; cutting the
                // base length off a foreign URL would queue a bogus path.
                $target = substr($location, $baseLength);
                $this->_addSitePath($target);
                $this->_redirects[$source] = $target;
            } else {
                $this->_redirects[$source] = $location;
            }
            
            $this->_log('Found a redirect from ' . $url . ' to ' . $location . '. Some links may need to be updated.', WATCHDOG_WARNING);
            return FALSE;
        } else if ($meta['http_code'] != 200) {
            $this->_log('HTTP ' . $meta['http_code'] . ' while fetching ' . $url . '. Possible dead link.', WATCHDOG_ERROR);
            return FALSE;
        }
        
        $data = array(
            'content' => $content,
            'contentType' => $meta['content_type'],
        );
        
        if ($this->_frontierPath) {
            // Prefer the file's date on the FTP server, fall back to the header.
            $mtime = $this->_getModifiedDate($url);
            if ($mtime) {
                $data['lastModified'] = $mtime;
            } else if (isset($headers['last-modified'])) {
                $data['lastModified'] = strtotime($headers['last-modified']);
            }
        }
    
        // Convert non-UTF-8 data to UTF-8. Only the charset token itself is
        // captured, not any parameters that follow it.
        if (preg_match('/charset=\s*"?([^\s;"]+)/i', (string) $data['contentType'], $matches)) {
            $charset = $matches[1];
            $converted = iconv($charset, 'UTF-8', $data['content']);
            if ($converted !== FALSE) {
                $data['content'] = $converted;
            } else {
                // Keep the raw bytes rather than replacing the page with FALSE.
                $this->_log('Could not convert ' . $url . ' from ' . $charset . ' to UTF-8.', WATCHDOG_WARNING);
            }
        }
        
        return $data;
    }
    
    /**
     * Looks up a file's modification time on the frontier FTP server.
     *
     * The URL is mapped to a path below $this->_frontierPath. For directory
     * URLs (trailing slash) each known index file name is tried in turn.
     *
     * @param string $url Absolute URL inside the site.
     * @return int|false|null NULL when there is no FTP connection or no
     *                        listing for the file, otherwise the result of
     *                        strtotime() on the date found in the listing.
     */
    private function _getModifiedDate($url)
    {
        if (!$this->_frontierConnect()) {
            return NULL;
        }
        
        // Don't want url encoded chars like %20 in ftp file path. rawurldecode()
        // is used because a literal "+" in a URL path is not an encoded space.
        $url = rawurldecode($url);
        
        // The URL may be exactly the base URL, which leaves an empty path.
        $path = (string) substr($url, strlen($this->_baseUrl));
        if (substr($path, 0, 1) != '/') {
            $path = '/'.$path;
        }
        
        $ftpPath = $this->_frontierPath . $path;
        $ftpPaths = array();
        if (substr($ftpPath, -1) == '/') {
            foreach ($this->_frontierIndexFiles as $frontierIndexFile) {
                $ftpPaths[] = $ftpPath . $frontierIndexFile;
            }
        } else {
            $ftpPaths[] = $ftpPath;
        }
        
        // Use the first candidate the server returns a listing for.
        foreach ($ftpPaths as $ftpPath) {
            $files = ftp_rawlist($this->_frontier, $ftpPath);
            if (isset($files[0])) {
                break;
            }
        }
        if (!isset($files[0])) {
            return NULL;
        }
        // NOTE(review): relies on the date occupying columns 43-54 of the raw
        // listing line; presumably matches the frontier server's format.
        $mtime = substr($files[0], 43, 12);
        $mtime = strtotime($mtime);
        return $mtime;
    }
    
    /**
     * Returns the FTP connection to frontier, opening it on first use.
     *
     * @return resource|null The connection stored in $this->_frontier, or NULL
     *                       when this is not a frontier site, no frontier path
     *                       was given, or connecting/logging in failed.
     */
    private function _frontierConnect()
    {
        if (!$this->_isFrontier || !$this->_frontierPath) {
            return NULL;
        }
        
        if (!$this->_frontier) {
            $connection = ftp_ssl_connect('frontier.unl.edu');
            if (!$connection) {
                $this->_log('Could not open an FTP connection to frontier.', WATCHDOG_ERROR);
                return NULL;
            }
            
            //TODO: make this a login that only has read access to everything.
            $login = ftp_login($connection, $this->_frontierUser, $this->_frontierPass);
            if (!$login) {
                // Release the socket and leave $this->_frontier empty; passive
                // mode must not be requested on a connection that is not
                // logged in.
                ftp_close($connection);
                $this->_frontier = NULL;
                $this->_log('Could not connect to frontier with user ' . $this->_frontierUser . '.', WATCHDOG_ERROR);
                return NULL;
            }
            
            ftp_pasv($connection, TRUE);
            $this->_frontier = $connection;
        }
        return $this->_frontier;
    }
    
    /**
     * Recursively walks a directory on frontier and queues every file found.
     *
     * Index files are queued as their directory, all other files by name.
     * Entries already listed in $this->_frontierFilesScanned are skipped, so a
     * scan that ran out of time can be resumed by calling this again.
     *
     * @param string $path       Directory to scan, relative to the frontier
     *                           path.
     * @param int    $time_limit Seconds, measured from $this->_start_time,
     *                           after which the scan stops.
     * @return bool FALSE when the time limit was hit, TRUE when the directory
     *              was scanned completely or there was nothing to scan.
     */
    private function _frontierScan($path, $time_limit)
    {
        if (!$this->_frontierConnect()) {
            return TRUE;
        }
        
        $ftpPath = $this->_frontierPath . $path;
        $rawFileList = ftp_rawlist($this->_frontier, $ftpPath);
        $fileList = ftp_nlist($this->_frontier, $ftpPath);
        if (!is_array($rawFileList) || !is_array($fileList)) {
            // The listing failed (or the server reports nothing for an empty
            // directory); there is nothing to walk here.
            return TRUE;
        }
        
        // Site paths are queued without a leading slash. This is kept in its
        // own variable: $path itself is still needed unchanged for recursing
        // into sub-directories and for the already-scanned bookkeeping.
        $sitePath = $path;
        if (substr($sitePath, 0, 1) == '/') {
            $sitePath = (string) substr($sitePath, 1);
        }
        
        // NOTE(review): assumes both listings return the same entries in the
        // same order, so that they can be matched up by index.
        foreach ($rawFileList as $index => $rawListing) {
            if (!isset($fileList[$index])) {
                continue;
            }
            $file = substr($fileList[$index], strlen($ftpPath));
            
            if (time() - $this->_start_time > $time_limit) {
                return FALSE;
            }
            
            if (in_array($path . $file, $this->_frontierFilesScanned)) {
                continue;
            }
            
            if (in_array($file, array('_notes', '_baks'))) {
                continue;
            }
            
            if (substr($rawListing, 0, 1) == 'd') {
                //file is a directory
                if (!$this->_frontierScan($path . $file . '/',  $time_limit)) {
                    return FALSE;
                }
            } else if (in_array($file, $this->_frontierIndexFiles)) {
                $this->_addSitePath($sitePath);
            } else {
                $this->_addSitePath($sitePath . $file);
            }
            $this->_frontierFilesScanned[] = $path . $file;
        }
        return TRUE;
    }
    
    /**
     * Records a message in the in-memory log, shows it to the user and writes
     * it to watchdog.
     *
     * @param string $message  Text to record.
     * @param int    $severity One of the WATCHDOG_* constants. INFO is shown
     *                         as a status message, WARNING as a warning and
     *                         everything else as an error.
     */
    private function _log($message, $severity = WATCHDOG_INFO)
    {
      $this->_log[] = $message;
      
      // Translate the watchdog severity into a drupal_set_message() type.
      switch ($severity) {
        case WATCHDOG_INFO:
          $messageType = 'status';
          break;
        case WATCHDOG_WARNING:
          $messageType = 'warning';
          break;
        default:
          $messageType = 'error';
      }
      
      drupal_set_message($message, $messageType, FALSE);
      watchdog('unl migration', $message, NULL, $severity);
    }

  /**
   * Extracts a named editable region from a template-based page.
   *
   * The region is looked for between "InstanceBeginEditable" comments first
   * and between "TemplateBeginEditable" comments second.
   *
   * @param string $html Page markup.
   * @param string $name Name of the editable region.
   * @return string|false Content of the first region found that is not empty,
   *                      FALSE otherwise.
   */
  private function _get_instance_editable_content($html, $name) {
    // Start/end comment pairs, in the order in which they are tried.
    $token_pairs = array(
      array('<!-- InstanceBeginEditable name="' . $name . '" -->', '<!-- InstanceEndEditable -->'),
      array('<!-- TemplateBeginEditable name="' . $name . '" -->', '<!-- TemplateEndEditable -->'),
    );
    
    foreach ($token_pairs as $token_pair) {
      $region = $this->_get_text_between_tokens($html, $token_pair[0], $token_pair[1]);
      if ($region) {
        return $region;
      }
    }
    
    return FALSE;
  }
  
  /**
   * Extracts the main content of a page that uses the old "MAIN CONTENT AREA"
   * marker comments.
   *
   * As a side effect $html is modified: '<div id="maincontent">' is inserted
   * right after the opening marker and '</div>' right after the closing one.
   *
   * @param string $html Page markup, changed in place.
   * @return string|false The text between the markers, or FALSE when there
   *                      is none.
   */
  private function _get_old_main_content_area(&$html) {
    $opening_marker = '<!--THIS IS THE MAIN CONTENT AREA -->';
    $closing_marker = '<!--THIS IS THE END OF THE MAIN CONTENT AREA.-->';
    
    // Read the content before the markup is touched.
    $main_content = $this->_get_text_between_tokens($html, $opening_marker, $closing_marker);
    
    $replacements = array();
    $replacements[$opening_marker] = $opening_marker . '<div id="maincontent">';
    $replacements[$closing_marker] = $closing_marker . '</div>';
    $html = strtr($html, $replacements);
    
    return $main_content;
  }
  
  /**
   * Returns the trimmed text found between two marker strings.
   *
   * The end token is searched for starting at the position of the start token.
   *
   * @param string $text        Text to search.
   * @param string $start_token Marker that precedes the wanted text.
   * @param string $end_token   Marker that follows the wanted text.
   * @param bool   $tidy_output Whether to run the result through
   *                            _tidy_html_fragment().
   * @return string|false FALSE when either marker is missing or the text
   *                      between them is empty.
   */
  private function _get_text_between_tokens($text, $start_token, $end_token, $tidy_output = TRUE) {
    $start_at = strpos($text, $start_token);
    if ($start_at === FALSE) {
      return FALSE;
    }
    
    $end_at = strpos($text, $end_token, $start_at);
    if ($end_at === FALSE) {
      return FALSE;
    }
    
    // Cut out what lies between the end of the start token and the end token.
    $inner_offset = $start_at + strlen($start_token);
    $inner_length = $end_at - $start_at - strlen($start_token);
    $inner_text = trim(substr($text, $inner_offset, $inner_length));
    if (!$inner_text) {
      return FALSE;
    }
    
    return $tidy_output ? $this->_tidy_html_fragment($inner_text) : $inner_text;
  }
  
  /**
   * Cleans up an HTML fragment with the Tidy extension.
   *
   * The fragment is parsed as UTF-8 and repaired; the output is indented,
   * unwrapped XHTML restricted to the body content.
   *
   * @param string $html Fragment to clean up.
   * @return string The repaired markup.
   */
  private function _tidy_html_fragment($html) {
    $tidy_options = array(
      'doctype' => 'transitional',
      'indent' => TRUE,
      'output-xhtml' => TRUE,
      'show-body-only' => TRUE,
      'wrap' => 0,
    );
    
    $document = new Tidy();
    $document->parseString($html, $tidy_options, 'utf8');
    $document->cleanRepair();
    
    // Casting the Tidy object to a string yields the repaired markup.
    $repaired = (string) $document;
    return $repaired;
  }
  
  /**
   * Serializes a migration into a new file in Drupal's temporary directory.
   *
   * @param Unl_Migration_Tool $instance Migration to store.
   * @return string Name of the file the migration was written to.
   */
  static public function save_to_disk(Unl_Migration_Tool $instance)
  {
    $storage_path = drupal_tempnam(file_directory_temp(), 'unl_migration_');
    $payload = serialize($instance);
    file_put_contents($storage_path, $payload);
    
    // NOTE(review): the file is made world readable/writable for command
    // line runs, presumably so that a different user can pick it up. The
    // serialized object includes whatever the instance holds; confirm this
    // is acceptable.
    if ('cli' == PHP_SAPI) {
      chmod($storage_path, 0666);
    }
    
    return $storage_path;
  }
  
  /**
   * Restores a migration written by save_to_disk() and deletes its file.
   *
   * NOTE(review): unserialize() will instantiate whatever the file describes;
   * this is only safe while both the file name and its contents come from a
   * trusted source.
   *
   * @param string $migration_storage_file Name of the storage file.
   * @return mixed Whatever unserialize() produced from the file contents.
   */
  static public function load_from_disk($migration_storage_file) {
    $payload = file_get_contents($migration_storage_file);
    $restored = unserialize($payload);
    
    // The storage file is single use.
    unlink($migration_storage_file);
    
    return $restored;
  }
}