<?php

/**
 * @file
 * Form, batch/queue callbacks and crawler class used to migrate an existing
 * static site into Drupal page nodes, menu links, blocks and breadcrumbs.
 */

/**
 * Form builder for the migration tool.
 *
 * When the form is being rebuilt, a minimal "Continue" form is shown instead
 * of the full set of migration options.
 *
 * @param array $form
 *   The form array to populate.
 * @param array $form_state
 *   The current form state.
 *
 * @return array
 *   The populated form array.
 */
function unl_migration($form, &$form_state) {
  if ($form_state['rebuild']) {
    $form['root'] = array(
      '#type' => 'fieldset',
      '#title' => 'This is taking a while. Click continue.',
    );
    $form['root']['submit'] = array(
      '#type' => 'submit',
      '#value' => 'Continue',
    );
    return $form;
  }

  $form['root'] = array(
    '#type' => 'fieldset',
    '#title' => 'Migration Tool',
  );
  $form['root']['site_url'] = array(
    '#type' => 'textfield',
    '#title' => t('Site URL'),
    '#description' => t('Full URL to the existing site you wish to migrate'),
    '#required' => TRUE,
  );
  $form['root']['frontier_path'] = array(
    '#type' => 'textfield',
    '#title' => t('Frontier FTP Path'),
    '#description' => t('Full path to the root of your site on frontier (if applicable).'),
    '#required' => FALSE,
  );
  $form['root']['frontier_user'] = array(
    '#type' => 'textfield',
    '#title' => t('Frontier FTP Username'),
    '#required' => FALSE,
  );
  $form['root']['frontier_pass'] = array(
    '#type' => 'password',
    '#title' => t('Frontier FTP Password'),
    '#required' => FALSE,
  );
  $form['root']['ignore_duplicates'] = array(
    '#type' => 'checkbox',
    '#title' => t('Ignore Duplicate Pages/Files'),
    '#description' => t("This may be needed if your site has an unlimited number of dynamicly generated paths."),
  );
  $form['submit'] = array(
    '#type' => 'submit',
    '#value' => 'Migrate',
  );

  return $form;
}

/**
 * Submit handler: builds a migration object and schedules it as a batch.
 *
 * @param array $form
 *   The submitted form.
 * @param array $form_state
 *   The form state holding the submitted values.
 */
function unl_migration_submit($form, &$form_state) {
  $migration = new Unl_Migration_Tool(
    $form_state['values']['site_url'],
    $form_state['values']['frontier_path'],
    $form_state['values']['frontier_user'],
    $form_state['values']['frontier_pass'],
    $form_state['values']['ignore_duplicates']
  );

  $operations = array(
    array(
      'unl_migration_step',
      array(
        $migration,
      ),
    ),
  );

  $batch = array(
    'operations' => $operations,
    // The batch API wants the path of this file relative to the Drupal root.
    'file' => substr(__FILE__, strlen(DRUPAL_ROOT) + 1),
  );
  batch_set($batch);
}

/**
 * Batch operation: runs one time-boxed slice of the migration.
 *
 * Between slices the migration object is persisted to a temp file whose name
 * is kept in the batch sandbox; the reported progress creeps up by 1% per
 * slice and is capped at 99% until migrate() reports completion.
 *
 * @param Unl_Migration_Tool $migration
 *   The migration as originally queued (only used for the first slice).
 * @param array $context
 *   The batch context.
 */
function unl_migration_step($migration, &$context) {
  $finished = 0;
  if (isset($context['sandbox']['file']) && file_exists($context['sandbox']['file'])) {
    // Resume from the state saved at the end of the previous slice.
    $migration = Unl_Migration_Tool::load_from_disk($context['sandbox']['file']);
    $finished = $context['sandbox']['finished'];
  }

  if ($migration->migrate()) {
    $context['finished'] = 1;
    return;
  }

  $finished += 0.01;
  if ($finished > 0.99) {
    $finished = 0.99;
  }
  $context['finished'] = $finished;
  $context['sandbox']['finished'] = $finished;
  $context['sandbox']['file'] = Unl_Migration_Tool::save_to_disk($migration);
}

/**
 * Queue worker: runs one 30 second slice of a migration stored on disk.
 *
 * If the migration is not finished, its state is saved again and a new item
 * is added to the 'unl_migration' queue.
 *
 * @param string $migration_storage_file
 *   Path of the file produced by Unl_Migration_Tool::save_to_disk().
 *
 * @return bool
 *   TRUE when the migration completed, FALSE when it was re-queued.
 */
function unl_migration_queue_step($migration_storage_file) {
  $migration = Unl_Migration_Tool::load_from_disk($migration_storage_file);
  if ($migration->migrate(30)) {
    return TRUE;
  }
  DrupalQueue::get('unl_migration', TRUE)
    ->createItem(Unl_Migration_Tool::save_to_disk($migration));
  return FALSE;
}

/**
 * Crawls an existing site and recreates it as Drupal content.
 *
 * The work is split into resumable states (see the STATE_* constants) so that
 * migrate() can be called repeatedly with a time limit, with the object being
 * serialized to disk between calls.
 */
class Unl_Migration_Tool {

  /**
   * base url to the site to migrate, eg: http://www.unl.edu/band/
   *
   * @var string
   */
  private $_baseUrl;

  /**
   * base path to frontier dir, eg: /cwis/data/band
   *
   * @var string
   */
  private $_frontierPath;
  private $_frontierUser;
  private $_frontierPass;

  /** FTP connection to frontier; created lazily, never serialized. */
  private $_frontier;

  /** cURL handle; created lazily, never serialized. */
  private $_curl;

  /** Paths discovered on the site, keyed by the SHA-256 of the path. */
  private $_siteMap = array();

  /** Paths already fetched, keyed by the SHA-256 of the path. */
  private $_processedPages = array();

  /** MD5 of the content of each processed path, keyed by path. */
  private $_processedPageHashes = array();

  /** Main content HTML of each page, keyed by path. */
  private $_content = array();

  /** Paths for which a node has already been created. */
  private $_createdContent = array();

  /** Last modification timestamps, keyed by path. */
  private $_lastModifications = array();

  /** Redirect targets, keyed by the redirected path. */
  private $_redirects = array();

  /** Per page (or per tag such as '<menu>') map of original href => site path. */
  private $_hrefTransform = array();

  /** Map of migrated file path => new URL of the file. */
  private $_hrefTransformFiles = array();

  /** Primary menu items (text, href and optional children). */
  private $_menu = array();

  /** Intermediate breadcrumbs (text and href). */
  private $_breadcrumbs = array();

  /** Path aliases of created nodes, keyed by node id. */
  private $_nodeMap = array();

  /** Page titles, keyed by path. */
  private $_pageTitles = array();

  /** Path of each page's parent (from its breadcrumbs), keyed by path. */
  private $_pageParentLinks = array();

  /** Every message passed to _log(). */
  private $_log = array();

  /** HTML of the migrated blocks, keyed by block name. */
  private $_blocks = array();

  private $_isFrontier = FALSE;
  private $_frontierIndexFiles = array('low_bandwidth.shtml', 'index.shtml', 'index.html', 'index.htm', 'default.shtml');
  private $_frontierFilesScanned = array();
  private $_ignoreDuplicates = FALSE;

  /**
   * Keep track of the state of the migration progress so that we can resume later
   * @var int
   */
  public $_state = self::STATE_NONE;
  const STATE_NONE = 1;
  const STATE_PROCESSING_BLOCKS = 2;
  const STATE_PROCESSING_PAGES = 3;
  const STATE_CREATING_NODES = 4;
  const STATE_DONE = 5;

  /** Timestamp at which the current migrate() call started. */
  private $_start_time;

  /**
   * Sets up a migration of the given site.
   *
   * @param string $baseUrl
   *   URL of the site to migrate; a trailing slash is added if missing.
   * @param string $frontierPath
   *   FTP path of the site root on frontier, or an empty string.
   * @param string $frontierUser
   *   FTP user name.
   * @param string $frontierPass
   *   FTP password.
   * @param mixed $ignoreDuplicates
   *   Truthy to skip pages whose content duplicates an earlier page.
   */
  public function __construct($baseUrl, $frontierPath, $frontierUser, $frontierPass, $ignoreDuplicates) {
    // NOTE(review): sending a header from a constructor is unusual; kept
    // because callers may rely on it - confirm whether it is still wanted.
    header('Content-type: text/plain');

    // Trim before parsing so surrounding whitespace cannot hide the host.
    $baseUrl = trim($baseUrl);

    // Check to see if we're migrating from frontier so we can make some extra assumptions.
    $baseUrlParts = parse_url($baseUrl);
    if (is_array($baseUrlParts) && isset($baseUrlParts['host'])) {
      $remoteHostname = @gethostbyaddr(gethostbyname($baseUrlParts['host']));
      if ($remoteHostname == 'frontier.unl.edu') {
        $this->_isFrontier = TRUE;
      }
    }

    // Add trailing slash if necessary
    if (substr($baseUrl, -1) != '/') {
      $baseUrl .= '/';
    }
    $frontierPath = trim($frontierPath);
    if ($frontierPath && substr($frontierPath, -1) != '/') {
      $frontierPath .= '/';
    }

    $this->_frontierPath = $frontierPath;
    $this->_frontierUser = $frontierUser;
    $this->_frontierPass = $frontierPass;
    $this->_ignoreDuplicates = (bool) $ignoreDuplicates;

    $this->_baseUrl = $baseUrl;
    // Seed the crawl with the site root.
    $this->_addSitePath('');
  }

  /**
   * Lists the properties to serialize.
   *
   * The cURL handle and the FTP connection cannot be meaningfully serialized
   * (and are non-serializable objects on PHP 8), so they are left out; both
   * are re-created on demand by _getUrl() and _frontierConnect().
   *
   * @return array
   *   Names of the properties to serialize.
   */
  public function __sleep() {
    return array_values(array_diff(
      array_keys(get_object_vars($this)),
      array('_curl', '_frontier')
    ));
  }

  /**
   * Runs (or resumes) the migration for roughly $time_limit seconds.
   *
   * @param int $time_limit
   *   Number of seconds after which the method returns so it can be resumed.
   *
   * @return bool
   *   TRUE when there is nothing left to do (including when the base URL
   *   could not be fetched), FALSE when the time limit was hit first.
   */
  public function migrate($time_limit = 5) {
    if (!$this->_sanity_check()) {
      return TRUE;
    }

    $this->_start_time = time();
    ini_set('memory_limit', -1);

    if ($this->_state == self::STATE_NONE) {
      if (!$this->_frontierScan('', $time_limit)) {
        return FALSE;
      }
      $this->_state = self::STATE_PROCESSING_BLOCKS;
      if (time() - $this->_start_time > $time_limit) {
        return FALSE;
      }
    }

    if ($this->_state == self::STATE_PROCESSING_BLOCKS) {
      // Parse the menu
      $this->_processMenu();
      $this->_process_blocks();
      $this->_process_breadcrumbs();
      $this->_state = self::STATE_PROCESSING_PAGES;
    }

    if ($this->_state == self::STATE_PROCESSING_PAGES) {
      // Process all of the pages on the site (Takes a while)
      // Processing a page can discover new paths, hence the outer loop.
      do {
        set_time_limit(30);
        $pagesToProcess = $this->_getPagesToProcess();
        foreach ($pagesToProcess as $pageToProcess) {
          if (time() - $this->_start_time > $time_limit) {
            return FALSE;
          }
          $this->_processPage($pageToProcess);
        }
      } while (count($pagesToProcess) > 0);

      // Fix any links to files that got moved to sites/<site>/files
      foreach ($this->_hrefTransform as $path => &$transforms) {
        if (array_key_exists('', $transforms)) {
          unset($transforms['']);
        }
        foreach ($transforms as $oldPath => &$newPath) {
          if (array_key_exists($newPath, $this->_redirects)) {
            $newPath = $this->_redirects[$newPath];
          }
          if (array_key_exists($newPath, $this->_hrefTransformFiles)) {
            $newPath = $this->_hrefTransformFiles[$newPath];
          }
        }
        // Break the reference so the next iteration cannot write through it.
        unset($newPath);
      }
      unset($transforms);

      $this->_state = self::STATE_CREATING_NODES;
    }

    if ($this->_state == self::STATE_CREATING_NODES) {
      // Update links and then create new page nodes. (Takes a while)
      foreach ($this->_content as $path => $content) {
        if (in_array($path, $this->_createdContent, TRUE)) {
          continue;
        }
        if (time() - $this->_start_time > $time_limit) {
          return FALSE;
        }
        set_time_limit(30);

        $hrefTransform = isset($this->_hrefTransform[$path]) ? $this->_hrefTransform[$path] : array();
        $content = strtr($content, $hrefTransform);

        $pageTitle = $this->_pageTitles[$path];
        // The page at the empty path becomes the front page.
        $this->_createPage($pageTitle, $content, $path, '' == $path);
        $this->_createdContent[] = $path;
      }

      $this->_createMenu();
      $this->_create_blocks();
      $this->_create_breadcrumbs();
      $this->_state = self::STATE_DONE;
    }

    return TRUE;
  }

  /**
   * Verifies that the base URL can be fetched.
   *
   * @return bool
   *   FALSE (after setting a form error) when the fetch failed.
   */
  private function _sanity_check() {
    if (!$this->_getUrl($this->_baseUrl)) {
      form_set_error('unl', 'The specified site does not exist!');
      return FALSE;
    }
    return TRUE;
  }

  /**
   * Registers a path (without its #fragment) as part of the site.
   *
   * @param string $path
   *   Path relative to the base URL.
   */
  private function _addSitePath($path) {
    if (($fragmentStart = strrpos($path, '#')) !== FALSE) {
      $path = substr($path, 0, $fragmentStart);
    }
    $this->_siteMap[hash('SHA256', $path)] = $path;
  }

  /**
   * Returns the known site paths that have not been processed yet.
   *
   * Both maps are keyed by the SHA-256 of the path, so a key diff replaces a
   * linear search per path.
   *
   * @return array
   *   List of paths, in discovery order.
   */
  private function _getPagesToProcess() {
    return array_values(array_diff_key($this->_siteMap, $this->_processedPages));
  }

  /**
   * Marks a path as processed.
   *
   * @param string $path
   *   Path relative to the base URL.
   */
  private function _addProcessedPage($path) {
    $this->_processedPages[hash('SHA256', $path)] = $path;
  }

  /**
   * Reads the two-level navigation menu from the site's front page.
   *
   * Every link inside the element with id "navigation" is registered for
   * crawling; the first <ul> inside it provides the primary items and their
   * children, which are stored in $this->_menu.
   */
  private function _processMenu() {
    $content = $this->_getUrl($this->_baseUrl);
    if (!$content) {
      return;
    }
    $html = $content['content'];

    $dom = new DOMDocument();
    // Real-world markup is rarely valid; suppress parser warnings as
    // _processPage() does.
    @$dom->loadHTML($html);
    $navlinksNode = $dom->getElementById('navigation');
    if (!$navlinksNode) {
      return;
    }

    // Check to see if there's a base tag on this page.
    $base_tags = $dom->getElementsByTagName('base');
    $page_base = NULL;
    if ($base_tags->length > 0) {
      $page_base = $base_tags->item(0)->getAttribute('href');
    }

    $linkNodes = $navlinksNode->getElementsByTagName('a');
    foreach ($linkNodes as $linkNode) {
      $this->_processLinks($linkNode->getAttribute('href'), '', $page_base, '<menu>');
    }

    $navlinksUlNode = $navlinksNode->getElementsByTagName('ul')->item(0);
    // Without a list there are no menu items; fall through to the error log.
    $primaryNodes = $navlinksUlNode ? $navlinksUlNode->childNodes : array();
    foreach ($primaryNodes as $primaryLinkLiNode) {
      if (strtolower($primaryLinkLiNode->nodeName) != 'li') {
        continue;
      }
      $primaryLinkNode = $primaryLinkLiNode->getElementsByTagName('a')->item(0);
      // Skip list items that contain no link at all.
      if (!$primaryLinkNode) {
        continue;
      }
      $menuItem = array(
        'text' => trim($primaryLinkNode->textContent),
        'href' => $this->_makeLinkAbsolute($primaryLinkNode->getAttribute('href'), ''),
      );

      $childLinksUlNode = $primaryLinkLiNode->getElementsByTagName('ul')->item(0);
      if (!$childLinksUlNode) {
        $this->_menu[] = $menuItem;
        continue;
      }

      $childMenu = array();
      foreach ($childLinksUlNode->childNodes as $childLinkLiNode) {
        if (strtolower($childLinkLiNode->nodeName) != 'li') {
          continue;
        }
        $childLinkNode = $childLinkLiNode->getElementsByTagName('a')->item(0);
        // If somebody left this menu item empty, skip it. Liferay, I'm looking at you!
        if (!$childLinkNode || !$childLinkNode->hasAttribute('href')) {
          continue;
        }
        $childMenu[] = array(
          'text' => trim($childLinkNode->textContent),
          'href' => $this->_makeLinkAbsolute($childLinkNode->getAttribute('href'), ''),
        );
      }
      $menuItem['children'] = $childMenu;
      $this->_menu[] = $menuItem;
    }

    if (count($this->_menu) == 0) {
      $this->_log('Could not find the navigation menu for your site!', WATCHDOG_ERROR);
    }
  }

  /**
   * Fills in a menu item's link_path (and fragment option) from an href.
   *
   * Links inside the migrated site are resolved to the node created for that
   * path; when no node matches, link_path is left untouched. Any other href
   * is used verbatim.
   *
   * @param array $item
   *   Menu link definition, modified in place.
   * @param string $href
   *   Absolute URL the menu item pointed to on the old site.
   */
  private function _resolveMenuLinkPath(array &$item, $href) {
    $baseLength = strlen($this->_baseUrl);
    if (substr($href, 0, $baseLength) != $this->_baseUrl) {
      $item['link_path'] = $href;
      return;
    }

    $path = substr($href, $baseLength);
    if (!$path) {
      $path = '';
    }
    // The parentheses matter: without them the comparison result, not the
    // position, would be assigned to $fragmentPos.
    if (($fragmentPos = strrpos($path, '#')) !== FALSE) {
      $item['options']['fragment'] = substr($path, $fragmentPos + 1);
      $path = substr($path, 0, $fragmentPos);
    }
    // Node aliases are stored without a trailing slash.
    if (substr($path, -1) == '/') {
      $path = substr($path, 0, -1);
    }
    $nodeId = array_search($path, $this->_nodeMap, TRUE);
    if ($nodeId) {
      $item['link_path'] = 'node/' . $nodeId;
    }
  }

  /**
   * Recreates the navigation menu and the page hierarchy in 'main-menu'.
   */
  private function _createMenu() {
    // Start off by removing the "Home" menu link if it exists.
    $menu_links = menu_load_links('main-menu');
    foreach ($menu_links as $menu_link) {
      if ($menu_link['plid'] == 0 &&
          $menu_link['link_title'] == 'Home' &&
          $menu_link['link_path'] == '<front>') {
        menu_link_delete($menu_link['mlid']);
      }
    }

    // Now recursively create each menu.
    $primaryWeights = 1;
    foreach ($this->_menu as $primaryMenu) {
      $item = array(
        'expanded' => TRUE,
        'menu_name' => 'main-menu',
        'link_title' => $primaryMenu['text'],
        'link_path' => '',
        'weight' => $primaryWeights++,
      );
      $this->_resolveMenuLinkPath($item, $primaryMenu['href']);

      if ($item['link_path']) {
        menu_link_save($item);
        $this->_log('Created menu item "' . $item['link_title'] . '" linked to ' . $item['link_path'] . '.');
      }
      else {
        $this->_log('Could not find a node to link to the ' . $item['link_title'] . ' menu item.', WATCHDOG_ERROR);
        continue;
      }

      if (!array_key_exists('children', $primaryMenu)) {
        continue;
      }

      // menu_link_save() stored the new link id in $item['mlid'].
      $plid = $item['mlid'];
      $parentTitle = $item['link_title'];
      $childWeights = 1;
      foreach ($primaryMenu['children'] as $childMenu) {
        $item = array(
          'menu_name' => 'main-menu',
          'link_title' => $childMenu['text'],
          'link_path' => '',
          'plid' => $plid,
          'weight' => $childWeights++,
        );
        $this->_resolveMenuLinkPath($item, $childMenu['href']);

        if ($item['link_path']) {
          menu_link_save($item);
          $this->_log('Created menu item "' . $parentTitle . ' / ' . $item['link_title'] . '" linked to ' . $item['link_path'] . '.');
        }
        else {
          $this->_log('Could not find a node to link to the "' . $parentTitle . ' / ' . $item['link_title'] . '" menu.', WATCHDOG_ERROR);
        }
      }
    }

    // Now set up the site hierarchy
    foreach ($this->_pageParentLinks as $path => $parentLink) {
      $this->_createParentLink($path, $parentLink);
    }
  }

  /**
   * Ensures a (hidden) menu link exists for a page, below its parent's link.
   *
   * @param string $childPath
   *   Path of the page.
   * @param string $parentPath
   *   Path of the page's parent.
   *
   * @return int
   *   The mlid of the page's menu link, or 0 for the site root.
   */
  private function _createParentLink($childPath, $parentPath) {
    // If the child is the site root, just return the root mlid.
    if (!$childPath) {
      return 0;
    }

    // If the child link already exists, just return its mlid.
    $childLink = menu_link_get_preferred(drupal_get_normal_path(rtrim($childPath, '/')));
    if ($childLink && $childLink['link_path'] != 'node/%') {
      return $childLink['mlid'];
    }

    // Find the parent link, if it doesn't exist, recursively create it.
    $parentNodePath = drupal_get_normal_path(rtrim($parentPath, '/'));
    $parentLink = menu_link_get_preferred($parentNodePath);
    if ($parentLink) {
      $parentLinkId = $parentLink['mlid'];
    }
    elseif (substr($parentNodePath, 0, 5) != 'node/') {
      // This will catch invalid breadcrumb links and change them to point to the site root.
      $parentLinkId = 0;
    }
    else {
      // A parent without a recorded parent of its own hangs off the root.
      $grandParentPath = isset($this->_pageParentLinks[$parentPath]) ? $this->_pageParentLinks[$parentPath] : '';
      $parentLinkId = $this->_createParentLink($parentPath, $grandParentPath);
    }

    // Create the menu item.
    $item = array(
      'menu_name' => 'main-menu',
      'link_title' => isset($this->_pageTitles[$childPath]) ? $this->_pageTitles[$childPath] : '',
      'link_path' => drupal_get_normal_path(rtrim($childPath, '/')),
      'plid' => $parentLinkId,
      'weight' => 50,
      'hidden' => 1,
    );
    menu_link_save($item);

    // Return its mlid.
    return $item['mlid'];
  }

  /**
   * Extracts the template's editable block regions from the front page.
   */
  private function _process_blocks() {
    $content = $this->_getUrl($this->_baseUrl);
    $html = $content ? $content['content'] : '';

    $this->_blocks['related_links'] = $this->_get_instance_editable_content($html, 'leftcollinks');
    $this->_blocks['contact_info'] = $this->_get_instance_editable_content($html, 'contactinfo');
    $this->_blocks['optional_footer'] = $this->_get_instance_editable_content($html, 'optionalfooter');
    $this->_blocks['footer_content'] = $this->_get_instance_editable_content($html, 'footercontent');

    // Filter out the existing headers.
    $this->_blocks['related_links'] = preg_replace('/\s*<h3>\s*Related Links\s*<\/h3>\s*/', '', $this->_blocks['related_links']);
    $this->_blocks['contact_info'] = preg_replace('/\s*<h3>\s*Contacting Us\s*<\/h3>\s*/', '', $this->_blocks['contact_info']);
    $this->_blocks['contact_info'] = preg_replace('/\s*<h3>\s*Contact Us\s*<\/h3>\s*/', '', $this->_blocks['contact_info']);
  }

  /**
   * Writes the extracted block HTML into the custom blocks with fixed ids.
   */
  private function _create_blocks() {
    // Custom block id => name of the extracted block stored in it.
    $blockIds = array(
      101 => 'contact_info',
      102 => 'related_links',
      103 => 'optional_footer',
      104 => 'footer_content',
    );
    foreach ($blockIds as $bid => $blockName) {
      db_update('block_custom')
        ->fields(array(
          'body' => $this->_blocks[$blockName],
        ))
        ->condition('bid', $bid)
        ->execute();
    }
  }

  /**
   * Reads the intermediate breadcrumbs from the site's front page.
   */
  private function _process_breadcrumbs() {
    $content = $this->_getUrl($this->_baseUrl);
    if (!$content) {
      return;
    }
    $html = $content['content'];

    $dom = new DOMDocument();
    @$dom->loadHTML($html);
    $breadcrumbs_node = $dom->getElementById('breadcrumbs');
    if (!$breadcrumbs_node) {
      return;
    }

    $link_nodes = $breadcrumbs_node->getElementsByTagName('a');
    $list_nodes = $breadcrumbs_node->getElementsByTagName('li');

    // More list items than links means the last breadcrumb is plain text.
    $unlinked_node = FALSE;
    if ($list_nodes->length > $link_nodes->length) {
      $unlinked_node = TRUE;
    }

    // Scan each of the breadcrumb links, skipping the first and the last (but only if there's an un-linked "true" last breadcrumb)
    for ($i = 1; $i < $link_nodes->length - ($unlinked_node ? 1 : 0); $i++) {
      $link_node = $link_nodes->item($i);
      $this->_breadcrumbs[] = array(
        'text' => trim($link_node->textContent),
        // Breadcrumbs come from the front page, so resolve against the root.
        'href' => $this->_makeLinkAbsolute($link_node->getAttribute('href'), ''),
      );
    }
  }

  /**
   * Stores the intermediate breadcrumbs in the unl_wdn theme settings.
   */
  private function _create_breadcrumbs() {
    $current_settings = variable_get('theme_unl_wdn_settings', array());
    $current_settings['intermediate_breadcrumbs'] = $this->_breadcrumbs;
    variable_set('theme_unl_wdn_settings', $current_settings);
  }

  /**
   * Fetches one path and records its content, title, links and parent.
   *
   * Non-HTML responses are saved as public files instead.
   *
   * @param string $path
   *   Path relative to the base URL.
   */
  private function _processPage($path) {
    $this->_addProcessedPage($path);
    $fullPath = $this->_baseUrl . $path;

    $data = $this->_getUrl($fullPath);
    // _getUrl() returns FALSE on redirects and HTTP errors.
    if (!$data || !$data['content']) {
      $this->_log('The file at ' . $fullPath . ' was empty! Ignoring.', WATCHDOG_ERROR);
      return;
    }

    $pageHash = hash('md5', $data['content']);
    if (($matchingPath = array_search($pageHash, $this->_processedPageHashes)) !== FALSE) {
      $logMessage = "The file found at $fullPath was a duplicate of the file at {$this->_baseUrl}$matchingPath !";
      if ($this->_ignoreDuplicates) {
        $this->_log($logMessage . ' Ignoring.', WATCHDOG_WARNING);
        return;
      }
      else {
        $this->_log($logMessage, WATCHDOG_WARNING);
      }
    }
    $this->_processedPageHashes[$path] = $pageHash;

    if (isset($data['lastModified'])) {
      $this->_lastModifications[$path] = $data['lastModified'];
    }

    if (strpos($data['contentType'], 'html') === FALSE) {
      if (!$data['contentType']) {
        $this->_log('The file type at ' . $fullPath . ' was not specified. Ignoring.', WATCHDOG_ERROR);
        return;
      }
      @drupal_mkdir('public://' . urldecode(dirname($path)), NULL, TRUE);
      if (!mb_check_encoding($path, 'UTF-8')) {
        $path = iconv('ISO-8859-1', 'UTF-8', $path);
      }
      try {
        file_save_data($data['content'], 'public://' . urldecode($path), FILE_EXISTS_REPLACE);
      }
      catch (Exception $e) {
        $this->_log('Could not migrate file "' . $path . '"! File name too long?', WATCHDOG_ERROR);
      }
      $this->_hrefTransformFiles[$path] = $this->_makeRelativeUrl(file_create_url('public://' . $path));
      return;
    }

    $html = $data['content'];

    $maincontentarea = $this->_get_instance_editable_content($html, 'maincontentarea');
    if (!$maincontentarea) {
      $maincontentarea = $this->_get_old_main_content_area($html);
    }
    if (!$maincontentarea) {
      $this->_log('The file at ' . $fullPath . ' has no valid maincontentarea. Using entire body.', WATCHDOG_WARNING);
      $maincontentarea = $this->_get_text_between_tokens($html, '<body>', '</body>');
    }
    if (!$maincontentarea) {
      // its possible the body tag has attributes. Check for this and filter them out.
      $maincontentarea = $this->_get_text_between_tokens($html, '<body', '</body>', FALSE);
      // As long as we find a closing bracket before the next opening bracket, its probably safe to assume the body tag is intact.
      if (strpos($maincontentarea, '>') < strpos($maincontentarea, '<')) {
        $maincontentarea = trim(substr($maincontentarea, strpos($maincontentarea, '>') + 1));
        // Tidy the output here, otherwise tidy would see HTML starting in the middle of a <body key="val"> tag.
        $maincontentarea = $this->_tidy_html_fragment($maincontentarea);
      }
      // Otherwise, ignore it all. (Will be an issue if the body has no other tags, but how likely is this?)
      else {
        $maincontentarea = '';
      }
    }
    if (!$maincontentarea) {
      $this->_log('The file at ' . $fullPath . ' has no valid body. Ignoring.', WATCHDOG_ERROR);
      return;
    }

    $dom = new DOMDocument();
    @$dom->loadHTML($html);

    // Check to see if there's a base tag on this page.
    $base_tags = $dom->getElementsByTagName('base');
    $page_base = NULL;
    if ($base_tags->length > 0) {
      $page_base = $base_tags->item(0)->getAttribute('href');
    }

    $pageTitle = '';
    $pageTitleNode = $dom->getElementById('pagetitle');
    if ($pageTitleNode) {
      $pageTitleH2Nodes = $pageTitleNode->getElementsByTagName('h2');
      if ($pageTitleH2Nodes->length > 0) {
        $pageTitle = $pageTitleH2Nodes->item(0)->textContent;
      }
    }

    // If there is no WDN compliant title, search for others
    if (!$pageTitle) {
      // First, check for a WDN compliant <title>
      $titleText = '';
      $titleNodes = $dom->getElementsByTagName('title');
      if ($titleNodes->length > 0) {
        $titleText = $titleNodes->item(0)->textContent;
      }
      $titleParts = explode('|', $titleText);
      if (count($titleParts) > 2) {
        $pageTitle = trim(array_pop($titleParts));
      }
      // Finally, combine what title does exist with the last part of the path
      else {
        $filename = trim($path, '/');
        $filename = explode('/', $filename);
        $filename = array_pop($filename);
        // Strip off a file extension if it exists.
        if (strrpos($filename, '.') !== FALSE) {
          $filename = substr($filename, 0, strrpos($filename, '.'));
        }
        $pageTitle = "$titleText ($filename)";
      }
    }

    if (!$pageTitle) {
      $this->_log('No page title was found at ' . $fullPath . '.', WATCHDOG_ERROR);
      $pageTitle = 'Untitled';
    }

    $maincontentNode = $dom->getElementById('maincontent');
    if (!$maincontentNode) {
      $this->_log('The file at ' . $fullPath . ' has no valid maincontentarea. Using entire body.', WATCHDOG_WARNING);
      $bodyNodes = $dom->getElementsByTagName('body');
      if ($bodyNodes->length == 0) {
        $this->_log('The file at ' . $fullPath . ' has no valid body. Ignoring.', WATCHDOG_ERROR);
        return;
      }
      $maincontentNode = $bodyNodes->item(0);
    }

    // Register every linked page and embedded image for crawling.
    $linkNodes = $maincontentNode->getElementsByTagName('a');
    foreach ($linkNodes as $linkNode) {
      $this->_processLinks($linkNode->getAttribute('href'), $path, $page_base);
    }

    $linkNodes = $maincontentNode->getElementsByTagName('img');
    foreach ($linkNodes as $linkNode) {
      $this->_processLinks($linkNode->getAttribute('src'), $path, $page_base);
    }

    $this->_content[$path] = $maincontentarea;
    $this->_pageTitles[$path] = $pageTitle;

    // Scan the page for the parent breadcrumb
    $breadcrumbs = $dom->getElementById('breadcrumbs');
    if ($breadcrumbs) {
      $breadcrumbs = $breadcrumbs->getElementsByTagName('a');
      $breadcrumb = $breadcrumbs->item($breadcrumbs->length - 1);
      if ($breadcrumb) {
        $breadcrumb = $breadcrumb->getAttribute('href');
        $breadcrumb = $this->_makeLinkAbsolute($breadcrumb, $path);
        if (substr($breadcrumb, 0, strlen($this->_baseUrl)) == $this->_baseUrl && $breadcrumb != $this->_baseUrl) {
          $pageParentLink = substr($breadcrumb, strlen($this->_baseUrl));
        }
        else {
          $pageParentLink = '';
        }
        // A page cannot be its own parent.
        if ($pageParentLink == $path) {
          $pageParentLink = '';
        }
        $this->_pageParentLinks[$path] = $pageParentLink;
      }
    }
  }

  /**
   * Records a link found on a page and queues its target for crawling.
   *
   * Only links that resolve to a URL below the base URL are recorded.
   *
   * @param string $originalHref
   *   The href/src exactly as written in the page.
   * @param string $path
   *   Path of the page containing the link.
   * @param string|null $page_base
   *   Value of the page's <base href>, if any.
   * @param string|null $tag
   *   Optional key to file the transform under instead of $path.
   */
  private function _processLinks($originalHref, $path, $page_base = NULL, $tag = NULL) {
    // Same-page anchors need no processing.
    if (substr($originalHref, 0, 1) == '#') {
      return;
    }

    if (!$page_base) {
      $page_base = $path;
    }

    $href = $this->_makeLinkAbsolute($originalHref, $page_base);

    if (substr($href, 0, strlen($this->_baseUrl)) == $this->_baseUrl) {
      $newPath = substr($href, strlen($this->_baseUrl));
      if ($newPath === FALSE) {
        $newPath = '';
      }
      if ($tag) {
        $this->_hrefTransform[$tag][$originalHref] = $newPath;
      }
      else {
        $this->_hrefTransform[$path][$originalHref] = $newPath;
      }
      $this->_addSitePath($newPath);
    }
  }

  /**
   * Resolves an href found on a page into a normalized absolute URL.
   *
   * "/./" and "/../" segments are collapsed. On frontier sites, URLs below
   * the base URL that end in a known index file are reduced to the directory.
   *
   * @param string $href
   *   The link as written in the page.
   * @param string $path
   *   Path (relative to the base URL) of the page containing the link.
   *
   * @return string
   *   The absolute URL; hrefs with a non-http(s) scheme are returned as is.
   */
  private function _makeLinkAbsolute($href, $path) {
    $path_parts = parse_url($path);

    // NOTE(review): when $path is itself an absolute URL (e.g. taken from a
    // <base> tag) it is discarded and links resolve against the site root -
    // confirm this is intended.
    if (isset($path_parts['scheme'])) {
      $path = '';
    }

    if (substr($path, -1) == '/') {
      $intermediatePath = $path;
    }
    else {
      $intermediatePath = dirname($path);
    }
    if ($intermediatePath == '.') {
      $intermediatePath = '';
    }
    if (strlen($intermediatePath) > 0 && substr($intermediatePath, -1) != '/') {
      $intermediatePath .= '/';
    }

    $parts = parse_url($href);
    // mailto:, javascript:, ftp: etc. are left untouched.
    if (isset($parts['scheme']) && !in_array($parts['scheme'], array('http', 'https'))) {
      return $href;
    }

    if (isset($parts['scheme'])) {
      $absoluteUrl = $href;
    }
    elseif (isset($parts['path']) && substr($parts['path'], 0, 1) == '/') {
      // Host-relative link.
      $baseParts = parse_url($this->_baseUrl);
      $absoluteUrl = $baseParts['scheme'] . '://' . $baseParts['host'] . $parts['path'];
      if (isset($parts['fragment'])) {
        $absoluteUrl .= '#' . $parts['fragment'];
      }
    }
    elseif (substr($href, 0, 1) == '#') {
      $absoluteUrl = $this->_baseUrl . $path . $href;
    }
    else {
      $absoluteUrl = $this->_baseUrl . $intermediatePath . $href;
    }

    $parts = parse_url($absoluteUrl);

    if (isset($parts['path'])) {
      while (strpos($parts['path'], '/./') !== FALSE) {
        $parts['path'] = strtr($parts['path'], array('/./' => '/'));
      }
      $i = 0;
      while (strpos($parts['path'], '/../') !== FALSE) {
        $parts['path'] = preg_replace('/\\/[^\\/]*\\/\\.\\.\\//', '/', $parts['path']);
        $parts['path'] = preg_replace('/^\\/\\.\\.\\//', '/', $parts['path']);
        // Prevent infinite loops if we get some crazy url. Give up on the
        // normalization rather than terminating the whole request.
        if ($i++ > 100) {
          break;
        }
      }
    }

    $absoluteUrl = $parts['scheme'] . '://' . $parts['host'];
    $absoluteUrl .= isset($parts['path']) ? $parts['path'] : '';
    $absoluteUrl .= isset($parts['query']) ? '?' . $parts['query'] : '';
    $absoluteUrl .= isset($parts['fragment']) ? '#' . $parts['fragment'] : '';

    if (
      $this->_isFrontier &&
      substr($absoluteUrl, 0, strlen($this->_baseUrl)) == $this->_baseUrl &&
      isset($parts['path']) &&
      in_array(basename($parts['path']), $this->_frontierIndexFiles)
    ) {
      $parts['path'] = dirname($parts['path']) . '/';
      // dirname() of a top-level file is "/", which would yield "//".
      while (substr($parts['path'], 0, 2) == '//') {
        $parts['path'] = substr($parts['path'], 1);
      }
      $absoluteUrl = $parts['scheme'] . '://' . $parts['host'];
      $absoluteUrl .= $parts['path'];
      $absoluteUrl .= isset($parts['query']) ? '?' . $parts['query'] : '';
      $absoluteUrl .= isset($parts['fragment']) ? '#' . $parts['fragment'] : '';
    }

    return $absoluteUrl;
  }

  /**
   * Given an absolute URL $href, returns a URL that is relative to $baseUrl
   *
   * URLs that do not start with $baseUrl are returned unchanged.
   *
   * @param string $href
   * @param string[optional] $baseUrl
   *   Defaults to the absolute URL of this Drupal site's front page.
   *
   * @return string
   */
  private function _makeRelativeUrl($href, $baseUrl = '') {
    if (!$baseUrl) {
      $baseUrl = url('<front>', array('absolute' => TRUE));
    }

    if (substr($href, 0, strlen($baseUrl)) != $baseUrl) {
      return $href;
    }

    if (variable_get('unl_use_base_tag', TRUE)) {
      return substr($href, strlen($baseUrl));
    }

    // Without a <base> tag the link has to be host-relative.
    $parts = parse_url($href);
    $relativeUrl = $parts['path'];
    $relativeUrl .= isset($parts['query']) ? '?' . $parts['query'] : '';
    $relativeUrl .= isset($parts['fragment']) ? '#' . $parts['fragment'] : '';
    return $relativeUrl;
  }

  /**
   * Creates a 'page' node for a migrated page.
   *
   * @param string $title
   *   Node title.
   * @param string $content
   *   Body HTML.
   * @param string $alias
   *   Path alias; a trailing slash is removed.
   * @param bool $makeFrontPage
   *   TRUE to make the node the site front page and use its title as the
   *   site name.
   */
  private function _createPage($title, $content, $alias = '', $makeFrontPage = FALSE) {
    if (substr($alias, -1) == '/') {
      $alias = substr($alias, 0, -1);
    }

    $node = new StdClass();
    $node->uid = $GLOBALS['user']->uid;
    $node->type = 'page';
    $node->title = $title;
    $node->language = 'und';
    $node->path['alias'] = $alias;
    if (module_exists('pathauto')) {
      // Keep pathauto from replacing the alias of the original page.
      $node->path['pathauto'] = FALSE;
    }
    // The body uses the first text format returned by filter_formats().
    $filter_format_keys = array_keys(filter_formats());
    $node->body = array(
      'und' => array(
        array(
          'value' => $content,
          'format' => array_shift($filter_format_keys),
        ),
      ),
    );

    node_submit($node);
    try {
      node_save($node);
    }
    catch (Exception $e) {
      $this->_log('Could not save page at ' . $alias . '. This is probably a case sensitivity conflict.', WATCHDOG_ERROR);
      return;
    }

    // NOTE(review): timestamps are stored under the crawled path, which may
    // still carry the trailing slash removed from $alias above - verify.
    if (isset($this->_lastModifications[$alias])) {
      $mtime = $this->_lastModifications[$alias];
      $mtimes = array(
        'created' => $mtime,
        'changed' => $mtime,
      );
      db_update('node')
        ->fields($mtimes)
        ->condition('nid', $node->nid)
        ->execute();
    }

    $this->_nodeMap[$node->nid] = $alias;

    if ($makeFrontPage) {
      variable_set('site_frontpage', 'node/' . $node->nid);
      variable_set('site_name', $title);
    }

    $this->_log('Created page "' . $title . '" with node id ' . $node->nid . ' at ' . $alias . '.');
  }

  /**
   * Fetches a URL.
   *
   * A header-only request is made first so that files over 10MB are never
   * downloaded. Redirects are recorded in $this->_redirects (and their target
   * queued when it is inside the site) but not followed.
   *
   * @param string $url
   *   The absolute URL to fetch.
   *
   * @return array|false
   *   FALSE on a redirect or any non-200 response; otherwise an array with
   *   'content', 'contentType' and, when known, 'lastModified'.
   */
  private function _getUrl($url) {
    if (!$this->_curl) {
      $this->_curl = curl_init();
    }

    $url = strtr($url, array(' ' => '%20'));

    curl_setopt($this->_curl, CURLOPT_URL, $url);
    curl_setopt($this->_curl, CURLOPT_RETURNTRANSFER, TRUE);
    curl_setopt($this->_curl, CURLOPT_HEADER, TRUE);
    curl_setopt($this->_curl, CURLOPT_NOBODY, TRUE);

    $data = curl_exec($this->_curl);
    $meta = curl_getinfo($this->_curl);

    $rawHeaders = substr($data, 0, $meta['header_size']);
    $rawHeaders = trim($rawHeaders);
    $rawHeaders = explode("\n", $rawHeaders);
    // Drop the status line.
    array_shift($rawHeaders);
    $headers = array();
    foreach ($rawHeaders as $rawHeader) {
      $splitPos = strpos($rawHeader, ':');
      // Skip blank or malformed lines.
      if ($splitPos === FALSE) {
        continue;
      }
      $headerKey = substr($rawHeader, 0, $splitPos);
      $headerValue = substr($rawHeader, $splitPos + 1);
      $headers[$headerKey] = trim($headerValue);
    }

    // don't copy files greater than 10MB in size
    if (isset($headers['Content-Length']) && $headers['Content-Length'] > (10 * 1024 * 1024)) {
      $size = floor($headers['Content-Length'] / (1024 * 1024));
      $this->_log("The file at $url is $size MB! Ignoring.", WATCHDOG_ERROR);
      $content = '';
    }
    else {
      curl_setopt($this->_curl, CURLOPT_NOBODY, FALSE);
      $data = curl_exec($this->_curl);
      $meta = curl_getinfo($this->_curl);
      $content = substr($data, $meta['header_size']);
    }

    if (in_array($meta['http_code'], array(301, 302))) {
      $baseLength = strlen($this->_baseUrl);
      $sourcePath = (string) substr($url, $baseLength);
      $location = isset($headers['Location']) ? $headers['Location'] : '';
      if (!$location) {
        $this->_log('HTTP ' . $meta['http_code'] . ' while fetching ' . $url . '. Possible dead link.', WATCHDOG_ERROR);
        return FALSE;
      }
      if (substr($location, 0, $baseLength) == $this->_baseUrl) {
        // Only targets inside the site are worth crawling.
        $targetPath = (string) substr($location, $baseLength);
        $this->_addSitePath($targetPath);
        $this->_redirects[$sourcePath] = $targetPath;
      }
      else {
        $this->_redirects[$sourcePath] = $location;
      }
      $this->_log('Found a redirect from ' . $url . ' to ' . $location . '. Some links may need to be updated.', WATCHDOG_WARNING);
      return FALSE;
    }
    elseif ($meta['http_code'] != 200) {
      $this->_log('HTTP ' . $meta['http_code'] . ' while fetching ' . $url . '. Possible dead link.', WATCHDOG_ERROR);
      return FALSE;
    }

    $data = array(
      'content' => $content,
      // cURL reports NULL when the server sent no Content-Type.
      'contentType' => (string) $meta['content_type'],
    );

    // NOTE(review): the Last-Modified header is only consulted when a
    // frontier path is configured - confirm that is intended.
    if ($this->_frontierPath) {
      $mtime = $this->_getModifiedDate($url);
      if ($mtime) {
        $data['lastModified'] = $mtime;
      }
      elseif (isset($headers['Last-Modified'])) {
        $data['lastModified'] = strtotime($headers['Last-Modified']);
      }
    }

    // Convert non-UTF-8 data to UTF-8.
    // Stop at ';', quotes or whitespace so trailing parameters are not taken
    // as part of the charset name.
    if (preg_match('/charset=([^;"\s]+)/i', $data['contentType'], $matches)) {
      $charset = $matches[1];
      $data['content'] = iconv($charset, 'UTF-8', $data['content']);
    }

    return $data;
  }

  /**
   * Looks up the modification time of a URL's file on the frontier FTP server.
   *
   * @param string $url
   *   Absolute URL below the base URL.
   *
   * @return int|false|null
   *   The timestamp as returned by strtotime(), or NULL when there is no FTP
   *   connection or the file was not listed.
   */
  private function _getModifiedDate($url) {
    if (!$this->_frontierConnect()) {
      return NULL;
    }

    // Don't want url encoded chars like %20 in ftp file path
    $url = urldecode($url);
    $path = (string) substr($url, strlen($this->_baseUrl));
    if (substr($path, 0, 1) != '/') {
      $path = '/' . $path;
    }
    $ftpPath = $this->_frontierPath . $path;

    // A directory URL is served by one of the index files.
    $ftpPaths = array();
    if (substr($ftpPath, -1) == '/') {
      foreach ($this->_frontierIndexFiles as $frontierIndexFile) {
        $ftpPaths[] = $ftpPath . $frontierIndexFile;
      }
    }
    else {
      $ftpPaths[] = $ftpPath;
    }

    $files = array();
    foreach ($ftpPaths as $ftpPath) {
      $files = ftp_rawlist($this->_frontier, $ftpPath);
      if (isset($files[0])) {
        break;
      }
    }
    if (!isset($files[0])) {
      return NULL;
    }

    // Presumably the date sits at a fixed column of the server's listing
    // format - TODO confirm.
    $mtime = substr($files[0], 43, 12);
    $mtime = strtotime($mtime);
    return $mtime;
  }

  /**
   * Returns the FTP connection to frontier, opening it on first use.
   *
   * @return resource|object|null
   *   The connection, or NULL when the site is not on frontier, no frontier
   *   path was given, or connecting/logging in failed.
   */
  private function _frontierConnect() {
    if (!$this->_isFrontier || !$this->_frontierPath) {
      return NULL;
    }

    if (!$this->_frontier) {
      $connection = ftp_ssl_connect('frontier.unl.edu');
      if (!$connection) {
        $this->_log('Could not connect to frontier with user ' . $this->_frontierUser . '.', WATCHDOG_ERROR);
        return NULL;
      }
      //TODO: make this a login that only has read access to everything.
      if (!ftp_login($connection, $this->_frontierUser, $this->_frontierPass)) {
        ftp_close($connection);
        $this->_log('Could not connect to frontier with user ' . $this->_frontierUser . '.', WATCHDOG_ERROR);
        return NULL;
      }
      ftp_pasv($connection, TRUE);
      $this->_frontier = $connection;
    }
    return $this->_frontier;
  }

  /**
   * Recursively scans the site's FTP directory and registers every file.
   *
   * Already scanned entries are remembered so that an interrupted scan can
   * be resumed.
   *
   * @param string $path
   *   Directory to scan, relative to the frontier path.
   * @param int $time_limit
   *   Seconds allowed since migrate() started.
   *
   * @return bool
   *   FALSE when the time limit was hit, TRUE when the scan completed (or no
   *   FTP connection is available).
   */
  private function _frontierScan($path, $time_limit) {
    if (!$this->_frontierConnect()) {
      return TRUE;
    }

    $ftpPath = $this->_frontierPath . $path;
    $rawFileList = ftp_rawlist($this->_frontier, $ftpPath);
    $fileList = ftp_nlist($this->_frontier, $ftpPath);
    // Treat an unreadable directory as empty.
    if (!is_array($rawFileList) || !is_array($fileList)) {
      return TRUE;
    }

    // Assumes both listings return the entries in the same order.
    foreach ($rawFileList as $index => $rawListing) {
      if (!isset($fileList[$index])) {
        continue;
      }
      $file = substr($fileList[$index], strlen($ftpPath));
      if (time() - $this->_start_time > $time_limit) {
        return FALSE;
      }
      if (in_array($path . $file, $this->_frontierFilesScanned)) {
        continue;
      }
      // Skip Dreamweaver's bookkeeping directories.
      if (in_array($file, array('_notes', '_baks'))) {
        continue;
      }
      // file is a directory
      if (substr($rawListing, 0, 1) == 'd') {
        if (!$this->_frontierScan($path . $file . '/', $time_limit)) {
          return FALSE;
        }
      }
      else {
        if (substr($path, 0, 1) == '/') {
          $path = substr($path, 1);
        }
        // Index files are reached through their directory's URL.
        if (in_array($file, $this->_frontierIndexFiles)) {
          $this->_addSitePath($path);
        }
        else {
          $this->_addSitePath($path . $file);
        }
      }
      $this->_frontierFilesScanned[] = $path . $file;
    }

    return TRUE;
  }

  /**
   * Records a message, shows it to the user and sends it to watchdog.
   *
   * @param string $message
   *   The message.
   * @param int $severity
   *   One of the WATCHDOG_* severity constants.
   */
  private function _log($message, $severity = WATCHDOG_INFO) {
    $this->_log[] = $message;

    if ($severity == WATCHDOG_INFO) {
      $type = 'status';
    }
    elseif ($severity == WATCHDOG_WARNING) {
      $type = 'warning';
    }
    else {
      $type = 'error';
    }
    drupal_set_message($message, $type, FALSE);
    watchdog('unl migration', $message, NULL, $severity);
  }

  /**
   * Returns the content of a named Dreamweaver editable region.
   *
   * Instance markers are tried first, then template markers.
   *
   * @param string $html
   *   The page HTML.
   * @param string $name
   *   Name of the editable region.
   *
   * @return string|false
   *   The tidied content, or FALSE when the region is missing or empty.
   */
  private function _get_instance_editable_content($html, $name) {
    $start_token = '<!-- InstanceBeginEditable name="' . $name . '" -->';
    $end_token = '<!-- InstanceEndEditable -->';
    if ($content = $this->_get_text_between_tokens($html, $start_token, $end_token)) {
      return $content;
    }

    $start_token = '<!-- TemplateBeginEditable name="' . $name . '" -->';
    $end_token = '<!-- TemplateEndEditable -->';
    if ($content = $this->_get_text_between_tokens($html, $start_token, $end_token)) {
      return $content;
    }

    return FALSE;
  }

  /**
   * Returns the main content of pages using the old comment markers.
   *
   * As a side effect, $html is modified so that the marked area is wrapped
   * in a <div id="maincontent">, letting later DOM lookups find it.
   *
   * @param string $html
   *   The page HTML, modified in place.
   *
   * @return string|false
   *   The tidied content, or FALSE when the markers are missing or empty.
   */
  private function _get_old_main_content_area(&$html) {
    $start_token = '<!--THIS IS THE MAIN CONTENT AREA -->';
    $end_token = '<!--THIS IS THE END OF THE MAIN CONTENT AREA.-->';

    $content = $this->_get_text_between_tokens($html, $start_token, $end_token);

    $html = strtr($html, array(
      $start_token => $start_token . '<div id="maincontent">',
      $end_token => $end_token . '</div>',
    ));

    return $content;
  }

  /**
   * Returns the trimmed text between the first start token and the next end token.
   *
   * @param string $text
   *   The text to search.
   * @param string $start_token
   *   Marker preceding the wanted text.
   * @param string $end_token
   *   Marker following the wanted text.
   * @param bool $tidy_output
   *   TRUE to run the result through _tidy_html_fragment().
   *
   * @return string|false
   *   The text, or FALSE when a token is missing or the text is empty.
   */
  private function _get_text_between_tokens($text, $start_token, $end_token, $tidy_output = TRUE) {
    $content_start = strpos($text, $start_token);
    if ($content_start === FALSE) {
      return FALSE;
    }
    $content_end = strpos($text, $end_token, $content_start);
    if ($content_end === FALSE) {
      return FALSE;
    }

    $content = substr(
      $text,
      $content_start + strlen($start_token),
      $content_end - $content_start - strlen($start_token)
    );
    $content = trim($content);
    if (!$content) {
      return FALSE;
    }

    if ($tidy_output) {
      $content = $this->_tidy_html_fragment($content);
    }
    return $content;
  }

  /**
   * Cleans up an HTML fragment with Tidy, returning body-only XHTML.
   *
   * @param string $html
   *   The fragment.
   *
   * @return string
   *   The repaired fragment.
   */
  private function _tidy_html_fragment($html) {
    $config = array(
      'doctype' => 'transitional',
      'indent' => TRUE,
      'output-xhtml' => TRUE,
      'show-body-only' => TRUE,
      'wrap' => 0,
    );
    $tidy = new Tidy();
    $tidy->parseString($html, $config, 'utf8');
    $tidy->cleanRepair();
    return (string) $tidy;
  }

  /**
   * Serializes a migration into a new temp file.
   *
   * @param Unl_Migration_Tool $instance
   *   The migration to save.
   *
   * @return string
   *   Path of the file that was written.
   */
  static public function save_to_disk(Unl_Migration_Tool $instance) {
    $migration_storage_file = drupal_tempnam(file_directory_temp(), 'unl_migration_');
    file_put_contents($migration_storage_file, serialize($instance));
    if (PHP_SAPI == 'cli') {
      // Presumably so the web server user can read a file created from the
      // command line - TODO confirm.
      chmod($migration_storage_file, 0666);
    }
    return $migration_storage_file;
  }

  /**
   * Restores a migration saved by save_to_disk() and deletes the file.
   *
   * @param string $migration_storage_file
   *   Path of the file to load.
   *
   * @return Unl_Migration_Tool|false
   *   The restored migration, or FALSE when the data could not be
   *   unserialized.
   */
  static public function load_from_disk($migration_storage_file) {
    // NOTE(review): unserialize() must only ever see files written by
    // save_to_disk(); never pass a path that can be influenced by a user.
    $instance = unserialize(file_get_contents($migration_storage_file));
    unlink($migration_storage_file);
    return $instance;
  }

}