<?php

/**
 * Form builder for the site migration tool.
 *
 * While the form is being rebuilt ($form_state['rebuild'] is truthy) only a
 * "Continue" prompt is returned; otherwise the full set of migration options
 * is presented inside a single fieldset, followed by the "Migrate" button.
 *
 * @param array $form
 *   The form array to extend.
 * @param array $form_state
 *   The current form state; only the 'rebuild' entry is read here.
 *
 * @return array
 *   The completed form definition.
 */
function unl_migration($form, &$form_state)
{
    if ($form_state['rebuild']) {
        $form['root'] = array(
            '#type' => 'fieldset',
            '#title' => 'This is taking a while.  Click continue.',
            'submit' => array(
                '#type' => 'submit',
                '#value' => 'Continue',
            ),
        );
        return $form;
    }

    $fieldset = array(
        '#type' => 'fieldset',
        '#title' => 'Migration Tool',
    );

    // Where to migrate from.
    $fieldset['site_url'] = array(
        '#type' => 'textfield',
        '#title' => t('Site URL'),
        '#description' => t('Full URL to the existing site you wish to migrate'),
        '#required' => TRUE
    );

    // Optional frontier FTP access.
    $fieldset['frontier_path'] = array(
        '#type' => 'textfield',
        '#title' => t('Frontier FTP Path'),
        '#description' => t('Full path to the root of your site on frontier (if applicable).'),
        '#required' => FALSE
    );
    $fieldset['frontier_user'] = array(
        '#type' => 'textfield',
        '#title' => t('Frontier FTP Username'),
        '#required' => FALSE
    );
    $fieldset['frontier_pass'] = array(
        '#type' => 'password',
        '#title' => t('Frontier FTP Password'),
        '#required' => FALSE
    );

    // Behaviour switches.
    $fieldset['ignore_duplicates'] = array(
        '#type' => 'checkbox',
        '#title' => t('Ignore Duplicate Pages/Files'),
        '#description' => t("This may be needed if your site has an unlimited number of dynamicly generated paths."),
    );
    $fieldset['use_liferay_code'] = array(
        '#type' => 'checkbox',
        '#title' => t('Use Liferay Detection'),
        '#description' => t("Normally, this won't interfere with non-liferay sites. If you have a /web directory, you should turn this off."),
        '#default_value' => 1,
    );
    $fieldset['use_liferay_titles'] = array(
        '#type' => 'checkbox',
        '#title' => t('Use Liferay Titles'),
        '#description' => t("Liferay doesn't use WDN compliant page titles. This enables their alternate method of finding the page title."),
        '#default_value' => 0,
    );

    $form['root'] = $fieldset;
    $form['submit'] = array(
        '#type' => 'submit',
        '#value' => 'Migrate'
    );

    return $form;
}

72
/**
 * Submit handler for the migration form.
 *
 * Builds an Unl_Migration_Tool from the submitted values and registers a
 * batch with a single operation, unl_migration_step(), that receives it.
 *
 * @param array $form
 *   The submitted form (unused).
 * @param array $form_state
 *   The form state; the submitted values are read from 'values'.
 */
function unl_migration_submit($form, &$form_state) {
  $values = $form_state['values'];

  $migration = new Unl_Migration_Tool(
    $values['site_url'],
    $values['frontier_path'],
    $values['frontier_user'],
    $values['frontier_pass'],
    $values['ignore_duplicates'],
    $values['use_liferay_code'],
    $values['use_liferay_titles']
  );

  // The batch needs this file's path relative to DRUPAL_ROOT (the "+ 1"
  // drops the directory separator that follows the root).
  $relativeFile = substr(__FILE__, strlen(DRUPAL_ROOT) + 1);

  batch_set(array(
    'operations' => array(
      array('unl_migration_step', array($migration)),
    ),
    'file' => $relativeFile,
  ));
}

100
/**
 * Batch operation callback: runs one time-boxed slice of the migration.
 *
 * If an earlier slice saved the migration to disk (path kept in
 * $context['sandbox']['file']), that saved copy replaces the object passed
 * in. The time budget starts at 1 second and grows by 50% per unfinished
 * slice, capped at 300 seconds.
 *
 * @param Unl_Migration_Tool $migration
 *   The migration as originally queued in the batch.
 * @param array $context
 *   The batch context; 'finished', 'message' and 'sandbox' are updated.
 */
function unl_migration_step($migration, &$context)
{
  // Resume from the copy saved by the previous slice, if there is one.
  if (isset($context['sandbox']['file']) && file_exists($context['sandbox']['file'])) {
    $migration = Unl_Migration_Tool::load_from_disk($context['sandbox']['file']);
  }
  if (!isset($context['sandbox']['duration'])) {
    $context['sandbox']['duration'] = 1;
  }

  // migrate() returns TRUE once there is nothing left to do.
  if ($migration->migrate($context['sandbox']['duration'])) {
    $context['finished'] = 1;
    $context['message'] = $migration->getMessage();
    return;
  }

  // Not done yet: report progress, persist the state and allow more time
  // for the next slice.
  $context['finished'] = $migration->getFinished();
  $context['message'] = $migration->getMessage();
  $context['sandbox']['file'] = Unl_Migration_Tool::save_to_disk($migration);
  $context['sandbox']['duration'] = min(300, ceil($context['sandbox']['duration'] * 1.5));
}

122

123
124
class Unl_Migration_Tool
{
125
126
127
128
129
    /**
     * base url to the site to migrate, eg: http://www.unl.edu/band/
     *
     * @var string
     */
130
    private $_baseUrl;
131
132
133
134
135
136

    /**
     * base path to frontier dir, eg: /cwis/data/band
     *
     * @var string
     */
137
    private $_frontierPath;
138
139
    private $_frontierUser;
    private $_frontierPass;
140
    private $_frontier;
141

142
    private $_curl;
143

144
145
146
147
148
149
    private $_siteMap             = array();
    private $_processedPages      = array();
    private $_processedPageHashes = array();
    private $_content             = array();
    private $_createdContent      = array();
    private $_lastModifications   = array();
150
    private $_redirects           = array();
151
152
153
    private $_hrefTransform       = array();
    private $_hrefTransformFiles  = array();
    private $_menu                = array();
154
    private $_breadcrumbs         = array();
155
156
    private $_nodeMap             = array();
    private $_pageTitles          = array();
157
    private $_pageParentLinks     = array();
158
159
    private $_log                 = array();
    private $_blocks              = array();
160
161
    private $_isFrontier          = FALSE;
    private $_frontierIndexFiles  = array('low_bandwidth.shtml', 'index.shtml', 'index.html', 'index.htm', 'default.shtml');
162
    private $_frontierFilesScanned = array();
163
    private $_ignoreDuplicates    = FALSE;
164
    private $_useLiferayCode      = TRUE;
165
166
    private $_useLiferayTitles    = FALSE;
    private $_liferayPageTitles   = array();
167
    private $_logger;
168

169
170
    private $_liferaySubsites     = array(
      'cropwatch.unl.edu'     => array('corn', 'drybeans', 'forages', 'organic', 'potato', 'sorghum', 'soybeans', 'wheat', 'bioenergy', 'insect', 'economics', 'ssm', 'soils', 'tillage', 'weed', 'varietytest', 'biotechnology', 'farmresearch', 'cropwatch-youth', 'militaryresources', 'gaps', 'sugarbeets'),
171
      '4h.unl.edu'            => array('extension-4-h-horse'),
172
      'animalscience.unl.edu' => array('fernando-lab', 'anscgenomics', 'rprb-lab', 'ruminutrition-lab', 'pre-vet-program'),
173
174
175
176
177
      'beef.unl.edu'          => array('cattleproduction'),
      'biochem.unl.edu'       => array('barycki', 'bailey', 'becker', 'adamec', 'wilson', 'biochem-fatttlab', 'simpson'),
      'bse.unl.edu'           => array('p2guidelines'),
      'edmedia.unl.edu'       => array('techtraining'),
      'food.unl.edu'          => array('localfoods', 'allergy', 'fnh', 'preservation', 'fpc', 'safety', 'meatproducts', 'youth'),
178
      'ianrhome.unl.edu'      => array('ianrinternational', 'liaison'),
179
180
      'water.unl.edu'         => array('crops', 'cropswater', 'drinkingwater', 'drought', 'wildlife', 'hydrology', 'lakes', 'landscapes', 'landscapewater', 'laweconomics', 'manure', 'propertydesign', 'research', 'sewage', 'students', 'watershed', 'wells', 'wetlands'),
      'westcentral.unl.edu'   => array('wcentomology', 'wcacreage'),
181
      'agecon.unl.edu'        => array('policy'),
182
183
    );

184
185
186
187
    /**
     * Keep track of the state of the migration progress so that we can resume later
     * @var int
     */
188
189
190
191
192
193
    public $_state = self::STATE_NONE;
    const STATE_NONE              = 1;
    const STATE_PROCESSING_BLOCKS = 2;
    const STATE_PROCESSING_PAGES  = 3;
    const STATE_CREATING_NODES    = 4;
    const STATE_DONE              = 5;
194

195
    private $_start_time;
196

197
    /**
     * Sets up a migration for the given site.
     *
     * @param string $baseUrl          URL of the site to migrate; a trailing slash is added if missing.
     * @param string $frontierPath     Path on frontier; a trailing slash is added when non-empty.
     * @param string $frontierUser     Frontier FTP user name.
     * @param string $frontierPass     Frontier FTP password.
     * @param mixed  $ignoreDuplicates Cast to bool and stored.
     * @param mixed  $useLiferayCode   Cast to bool and stored.
     * @param mixed  $useLiferayTitles Cast to bool and stored.
     */
    public function __construct($baseUrl, $frontierPath, $frontierUser, $frontierPass, $ignoreDuplicates, $useLiferayCode = FALSE, $useLiferayTitles = FALSE)
    {
        // Strip surrounding whitespace before parsing, so that a padded URL
        // still yields a usable host name.
        $baseUrl = trim($baseUrl);

        // Check to see if we're migrating from frontier so we can make some extra assumptions.
        // The reverse lookup is only attempted when the URL actually contains a host.
        $host = parse_url($baseUrl, PHP_URL_HOST);
        if ($host) {
            $remoteHostname = @gethostbyaddr(gethostbyname($host));
            if ($remoteHostname == 'frontier.unl.edu') {
                $this->_isFrontier = TRUE;
            }
        }

        // Add trailing slash if necessary
        if (substr($baseUrl, -1) != '/') {
            $baseUrl .= '/';
        }

        $frontierPath = trim($frontierPath);
        if ($frontierPath && substr($frontierPath, -1) != '/') {
          $frontierPath .= '/';
        }

        $this->_frontierPath = $frontierPath;
        $this->_frontierUser = $frontierUser;
        $this->_frontierPass = $frontierPass;

        $this->_ignoreDuplicates = (bool) $ignoreDuplicates;
        $this->_useLiferayCode = (bool) $useLiferayCode;
        $this->_useLiferayTitles = (bool) $useLiferayTitles;

        $this->_baseUrl = $baseUrl;
        // Seed the site map with the site root.
        $this->_addSitePath('');
    }

229
    /**
     * Runs (or resumes) the migration for roughly $time_limit seconds.
     *
     * Progress is tracked in $this->_state using the STATE_* constants, so a
     * call that runs out of time can be continued by calling migrate() again.
     *
     * @param int $time_limit Time budget for this call, in seconds.
     *
     * @return bool TRUE when there is nothing more to do (the migration
     *              completed, or the sanity check failed); FALSE when the
     *              time budget ran out and another call is needed.
     */
    public function migrate($time_limit = 5)
    {
      if (!$this->_sanity_check()) {
        return TRUE;
      }

      $this->_start_time = time();
      ini_set('memory_limit', -1);

      if ($this->_state == self::STATE_NONE) {
        if (!$this->_frontierScan('', $time_limit)) {
          return FALSE;
        }

        $this->_state = self::STATE_PROCESSING_BLOCKS;
        if (time() - $this->_start_time > $time_limit) {
          return FALSE;
        }
      }

      if ($this->_state == self::STATE_PROCESSING_BLOCKS) {
        // Parse the menu
        $this->_processMenu();
        $this->_process_blocks();
        $this->_process_breadcrumbs();
        $this->_process_liferay_sitemap();
        $this->_state = self::STATE_PROCESSING_PAGES;
      }

      if ($this->_state == self::STATE_PROCESSING_PAGES) {
        // Process all of the pages on the site (Takes a while)
        do {
          // set_time_limit() takes whole seconds.
          set_time_limit((int) max(30, $time_limit * 1.5));

          $pagesToProcess = $this->_getPagesToProcess();
          foreach ($pagesToProcess as $pageToProcess) {
            if (time() - $this->_start_time > $time_limit) {
              return FALSE;
            }
            try {
              $this->_processPage($pageToProcess);
            }
            catch (Exception $e) {
              $this->_log('An exception occured while processing "' . $pageToProcess . '": "' . $e->getMessage() . '".', WATCHDOG_ERROR);
            }
          }
        } while (count($pagesToProcess) > 0);


        // Fix any links to files that got moved to sites/<site>/files
        foreach ($this->_hrefTransform as $path => &$transforms) {
          if (array_key_exists('', $transforms)) {
            unset($transforms['']);
          }
          foreach ($transforms as $oldPath => &$newPath) {
            if (array_key_exists($newPath, $this->_redirects)) {
              $newPath = $this->_redirects[$newPath];
            }
            if (array_key_exists($newPath, $this->_hrefTransformFiles)) {
              $newPath = $this->_hrefTransformFiles[$newPath];
            }
          }
          // Break the reference so later writes to $newPath cannot leak
          // into the last transform entry.
          unset($newPath);
        }
        unset($transforms);

        $this->_state = self::STATE_CREATING_NODES;
      }


      if ($this->_state == self::STATE_CREATING_NODES) {
        // Update links and then create new page nodes. (Takes a while)
        foreach ($this->_content as $path => $content) {
          if (in_array($path, $this->_createdContent, TRUE)) {
            continue;
          }
          if (time() - $this->_start_time > $time_limit) {
            return FALSE;
          }
          set_time_limit((int) max(30, $time_limit * 1.5));

          $hrefTransforms = isset($this->_hrefTransform[$path]) ? $this->_hrefTransform[$path] : array();
          foreach ($hrefTransforms as $hrefTransformFrom => $hrefTransformTo) {
            $content = str_replace(htmlspecialchars($hrefTransformFrom), htmlspecialchars($hrefTransformTo), $content);
          }

          $pageTitle = $this->_pageTitles[$path];
          try {
            // Only the empty path is the site root. The comparison is done on
            // strings because PHP turns numeric-looking array keys into
            // integers, and a loose '' == 0 is TRUE before PHP 8.
            $this->_createPage($pageTitle, $content, $path, '' === (string) $path);
          }
          catch (Exception $e) {
            $this->_log('An exception occured while creating "' . $path . '": "' . $e->getMessage() . '".', WATCHDOG_ERROR);
          }
          $this->_createdContent[] = $path;
        }

        $this->_createMenu();
        $this->_create_blocks();
        $this->_create_breadcrumbs();

        $this->_state = self::STATE_DONE;
      }

      return TRUE;
    }

333
334
335
336
337
338
339
    /**
     * Checks that the base URL can be fetched at all.
     *
     * Registers a form error when _getUrl() returns a falsy result.
     *
     * @return bool TRUE when the base URL returned something, FALSE otherwise.
     */
    private function _sanity_check() {
      $response = $this->_getUrl($this->_baseUrl);
      if ($response) {
        return TRUE;
      }

      form_set_error('unl', 'The specified site does not exist!');
      return FALSE;
    }
340
341
    
    /**
     * Adds a path to the site map, keyed by the SHA-256 hash of the path.
     *
     * Any "#fragment" is removed and the path is trimmed first. Unless
     * $caseSensitive is set, a path that matches an existing entry
     * case-insensitively is not added again.
     *
     * @param string $path              Site-relative path.
     * @param bool   $allowTralingSlash When TRUE only spaces are trimmed, so slashes are kept.
     * @param bool   $caseSensitive     When TRUE the case-insensitive duplicate check is skipped.
     */
    private function _addSitePath($path, $allowTralingSlash = FALSE, $caseSensitive = FALSE)
    {
      // Blacklist any liferay calendars to avoid crawling an infinite number of pages
      if ($this->_useLiferayCode && strpos($path, 'struts_action=%2Fcalendar%2Fview') !== FALSE) {
        return;
      }

      if (($fragmentStart = strrpos($path, '#')) !== FALSE) {
          $path = substr($path, 0, $fragmentStart);
      }
      $path = trim($path, $allowTralingSlash ? ' ' : '/ ');

      // The lower-cased copy of the map is only needed for the
      // case-insensitive check. The comparison is strict so that
      // numeric-looking paths (e.g. "1e3" and "1000") are not mistaken for
      // one another.
      if (!$caseSensitive) {
        $knownPaths = array_map('strtolower', $this->_siteMap);
        if (in_array(strtolower($path), $knownPaths, TRUE)) {
          return;
        }
      }

      $this->_siteMap[hash('SHA256', $path)] = $path;
    }
362

363
364
365
366
367
368
369
370
371
372
373
    /**
     * Lists the site map paths that have not been processed yet.
     *
     * @return array Paths from $this->_siteMap that are absent from
     *               $this->_processedPages, in site map order.
     */
    private function _getPagesToProcess()
    {
        $pagesToProcess = array();
        foreach ($this->_siteMap as $path) {
            // Strict comparison: with a loose one, numeric-looking paths such
            // as "2011" and "2011.0" compare equal and a page gets skipped.
            if (!in_array($path, $this->_processedPages, TRUE)) {
                $pagesToProcess[] = $path;
            }
        }
        return $pagesToProcess;
    }
374

375
376
377
378
    /**
     * Records a path as processed, keyed by the SHA-256 hash of the path.
     *
     * @param string $path Site-relative path.
     */
    private function _addProcessedPage($path)
    {
        $key = hash('SHA256', $path);
        $this->_processedPages[$key] = $path;
    }
379

380
381
    /**
     * Reads the navigation menu from the site's front page.
     *
     * Every link inside the element with id "navigation" is handed to
     * _processLinks() under the '<menu>' marker. The first <ul> in that
     * element is then read as a two-level menu (primary items with optional
     * children) and stored in $this->_menu. An error is logged when no menu
     * items could be found.
     */
    private function _processMenu()
    {
        $content = $this->_getUrl($this->_baseUrl);
        $html = $content['content'];

        $dom = new DOMDocument();
        $dom->loadHTML($html);
        $navlinksNode = $dom->getElementById('navigation');
        if (!$navlinksNode) {
          return;
        }

        // Check to see if there's a base tag on this page.
        $base_tags = $dom->getElementsByTagName('base');
        $page_base = NULL;
        if ($base_tags->length > 0) {
          $page_base = $base_tags->item(0)->getAttribute('href');
        }

        $linkNodes = $navlinksNode->getElementsByTagName('a');
        foreach ($linkNodes as $linkNode) {
            $this->_processLinks($linkNode->getAttribute('href'), '', $page_base, '<menu>');
        }

        // A navigation element without any <ul> simply yields no menu items;
        // the empty-menu error below then reports it.
        $navlinksUlNode = $navlinksNode->getElementsByTagName('ul')->item(0);
        $primaryLinkLiNodes = $navlinksUlNode ? $navlinksUlNode->childNodes : array();
        foreach ($primaryLinkLiNodes as $primaryLinkLiNode) {
            if (strtolower($primaryLinkLiNode->nodeName) != 'li') {
                continue;
            }
            $primaryLinkNode = $primaryLinkLiNode->getElementsByTagName('a')->item(0);
            // A list item without any link cannot become a menu item.
            if (!$primaryLinkNode) {
                continue;
            }
            $menuItem = array('text' => trim($primaryLinkNode->textContent),
                              'href' => $this->_makeLinkAbsolute($primaryLinkNode->getAttribute('href'), ''));

            $childLinksUlNode = $primaryLinkLiNode->getElementsByTagName('ul')->item(0);
            if (!$childLinksUlNode) {
                $this->_menu[] = $menuItem;
                continue;
            }
            $childMenu = array();
            foreach ($childLinksUlNode->childNodes as $childLinkLiNode) {
                if (strtolower($childLinkLiNode->nodeName) != 'li') {
                    continue;
                }
                $childLinkNode = $childLinkLiNode->getElementsByTagName('a')->item(0);
                // If somebody left this menu item empty, skip it.  Liferay, I'm looking at you!
                if (!$childLinkNode || !$childLinkNode->hasAttribute('href')) {
                    continue;
                }
                $childMenu[] = array('text' => trim($childLinkNode->textContent),
                                     'href' => $this->_makeLinkAbsolute($childLinkNode->getAttribute('href'), ''));
            }
            $menuItem['children'] = $childMenu;
            $this->_menu[] = $menuItem;
        }

        if (count($this->_menu) == 0) {
            $this->_log('Could not find the navigation menu for your site!', WATCHDOG_ERROR);
        }
    }
439

440
441
    /**
     * Creates the main-menu links from the menu read by _processMenu().
     *
     * The default top-level "Home" link is removed first. Each primary item
     * and its children are then saved as menu links: hrefs under the base
     * URL are resolved to "node/<nid>" through $this->_nodeMap
     * (case-insensitively), any other href is linked as-is. Items that
     * cannot be resolved are logged and skipped. Finally a parent link is
     * created for every entry of $this->_pageParentLinks.
     */
    private function _createMenu()
    {
        // Start off by removing the "Home" menu link if it exists.
        $menu_links = menu_load_links('main-menu');
        foreach ($menu_links as $menu_link) {
          if ($menu_link['plid'] == 0 &&
              $menu_link['link_title'] == 'Home' &&
              $menu_link['link_path'] == '<front>') {
            menu_link_delete($menu_link['mlid']);
          }
        }

        // The node map is not modified while links are created, so it is
        // lower-cased once for all lookups.
        $lowerNodeMap = array_map('strtolower', $this->_nodeMap);

        // Now recursively create each menu.
        $primaryWeights = 1;
        foreach ($this->_menu as $primaryMenu) {
            $item = array(
                'expanded' => TRUE,
                'menu_name' => 'main-menu',
                'link_title' => $primaryMenu['text'],
                'link_path' => '',
                'weight' => $primaryWeights++
            );
            $href = $primaryMenu['href'];
            if (substr($href, 0, strlen($this->_baseUrl)) == $this->_baseUrl) {
                $path = substr($href, strlen($this->_baseUrl));
                if (!$path) {
                    $path = '';
                }
                $path = trim($path, '/');

                // The parentheses matter: without them "!==" binds tighter
                // than "=", and $fragmentPos would hold a boolean.
                if (($fragmentPos = strrpos($path, '#')) !== FALSE) {
                    $item['options']['fragment'] = substr($path, $fragmentPos + 1);
                    $path = substr($path, 0, $fragmentPos);
                }
                if (substr($path, -1) == '/') {
                    $path = substr($path, 0, -1);
                }
                $nodeId = array_search(strtolower($path), $lowerNodeMap, TRUE);
                if ($nodeId) {
                    $item['link_path'] = 'node/' . $nodeId;
                }
            } else {
                $item['link_path'] = $href;
            }

            if ($item['link_path']) {
              try {
                menu_link_save($item);
                $this->_log('Created menu item "' . $item['link_title'] . '" linked to ' . $item['link_path'] . '.');
              }
              catch (Exception $e) {
                $this->_log('An exception occured creating the menu link for "' . $item['link_title'] . '".', WATCHDOG_ERROR);
                continue;
              }
            } else {
              $this->_log('Could not find a node to link to the ' . $item['link_title'] . ' menu item.', WATCHDOG_ERROR);
              continue;
            }

            if (!array_key_exists('children', $primaryMenu)) {
                continue;
            }

            // 'mlid' is read from the item that was just passed to menu_link_save().
            $plid = $item['mlid'];
            $parentTitle = $item['link_title'];
            $childWeights = 1;
            foreach ($primaryMenu['children'] as $childMenu) {
                $item = array(
                    'menu_name' => 'main-menu',
                    'link_title' => $childMenu['text'],
                    'link_path' => '',
                    'plid' => $plid,
                    'weight' => $childWeights++
                );
                $href = $childMenu['href'];
                if (substr($href, 0, strlen($this->_baseUrl)) == $this->_baseUrl) {
                    $path = substr($href, strlen($this->_baseUrl));
                    if (!$path) {
                        $path = '';
                    }
                    $path = trim($path, '/');

                    if (($fragmentPos = strrpos($path, '#')) !== FALSE) {
                        $item['options']['fragment'] = substr($path, $fragmentPos + 1);
                        $path = substr($path, 0, $fragmentPos);
                    }
                    if (substr($path, -1) == '/') {
                        $path = substr($path, 0, -1);
                    }
                    $nodeId = array_search(strtolower($path), $lowerNodeMap, TRUE);
                    if ($nodeId) {
                        $item['link_path'] = 'node/' . $nodeId;
                    }
                } else {
                    $item['link_path'] = $href;
                }

                if ($item['link_path']) {
                  try {
                    menu_link_save($item);
                    $this->_log('Created menu item "' . $parentTitle . ' / ' . $item['link_title'] . '" linked to ' . $item['link_path'] . '.');
                  }
                  catch (Exception $e) {
                    $this->_log('An exception occured creating the menu link for ' . $parentTitle . ' / ' . $item['link_title'] . '.', WATCHDOG_ERROR);
                  }
                } else {
                    $this->_log('Could not find a node to link to the "' . $parentTitle . ' / ' . $item['link_title'] . '" menu.', WATCHDOG_ERROR);
                }
            }
        }


        // Now set up the site hierarchy
        foreach ($this->_pageParentLinks as $path => $parentLink) {
          $this->_createParentLink($path, $parentLink);
        }
    }

559
    /**
     * Ensures a hidden main-menu link exists for a page, below its parent.
     *
     * When the parent has no menu link yet and resolves to a node path, the
     * parent's own link is created first by recursion.
     *
     * @param string $childPath  Path of the page that needs a menu link.
     * @param string $parentPath Path of that page's parent.
     *
     * @return int The mlid of the page's menu link, or 0 for the site root.
     */
    private function _createParentLink($childPath, $parentPath) {

      // If the child is the site root, just return the root mlid.
      if (!$childPath) {
        return 0;
      }

      // If the child link already exists, just return its mlid.
      $childLink = menu_link_get_preferred(drupal_get_normal_path(rtrim($childPath, '/')));
      if ($childLink && $childLink['link_path'] != 'node/%') {
        return $childLink['mlid'];
      }

      // Find the parent link, if it doesn't exist, recursively create it.
      $parentNodePath = drupal_get_normal_path(rtrim($parentPath, '/'));
      $parentLink = menu_link_get_preferred($parentNodePath);
      if ($parentLink) {
        $parentLinkId = $parentLink['mlid'];
      } else if (substr($parentNodePath, 0, 5) != 'node/') {
        // This will catch invalid breadcrumb links and change them to point to the site root.
        $parentLinkId = 0;
      } else {
        // A parent with no recorded parent of its own is attached to the
        // site root; the empty path fails to resolve to a node path in the
        // recursive call.
        $grandparentPath = isset($this->_pageParentLinks[$parentPath]) ? $this->_pageParentLinks[$parentPath] : '';
        $parentLinkId = $this->_createParentLink($parentPath, $grandparentPath);
      }

      // Create the menu item.
      $item = array(
        'menu_name' => 'main-menu',
        'link_title' => $this->_pageTitles[$childPath],
        'link_path' => drupal_get_normal_path(rtrim($childPath, '/')),
        'plid' => $parentLinkId,
        'weight' => 50,
        'hidden' => 1,
      );
      menu_link_save($item);

      // Return its mlid.
      return $item['mlid'];
    }
599

600
601
602
    /**
     * Extracts the four template regions of the front page into $this->_blocks.
     *
     * The regions 'leftcollinks', 'contactinfo', 'optionalfooter' and
     * 'footercontent' are read with _get_instance_editable_content(). Every
     * link found in a region is passed to _processLinks() under a
     * '<block name>' marker. The "Related Links", "Contacting Us" and
     * "Contact Us" <h3> headings are then removed from the related-links and
     * contact-info blocks.
     */
    private function _process_blocks() {
      $content = $this->_getUrl($this->_baseUrl);
      $html = $content['content'];

      $this->_blocks['related_links'] = $this->_get_instance_editable_content($html, 'leftcollinks');
      $this->_blocks['contact_info'] = $this->_get_instance_editable_content($html, 'contactinfo');
      $this->_blocks['optional_footer'] = $this->_get_instance_editable_content($html, 'optionalfooter');
      $this->_blocks['footer_content'] = $this->_get_instance_editable_content($html, 'footercontent');

      foreach ($this->_blocks as $blockName => $block) {
        // A region with no content has no links, and loadHTML() rejects an
        // empty source outright on PHP 8.
        if (!is_string($block) || $block === '') {
          continue;
        }
        $dom = new DOMDocument();
        @$dom->loadHTML($block);
        $linkNodes = $dom->getElementsByTagName('a');
        foreach ($linkNodes as $linkNode) {
          $this->_processLinks($linkNode->getAttribute('href'), '', '', '<' . $blockName . '>');
        }
      }

      // Filter out the existing headers.
      $this->_blocks['related_links'] = preg_replace('/\s*<h3>\s*Related Links\s*<\/h3>\s*/', '', $this->_blocks['related_links']);
      // Same shape as the other two patterns: optional whitespace on both
      // sides of the literal heading text.
      $this->_blocks['contact_info'] = preg_replace('/\s*<h3>\s*Contacting Us\s*<\/h3>\s*/', '', $this->_blocks['contact_info']);
      $this->_blocks['contact_info'] = preg_replace('/\s*<h3>\s*Contact Us\s*<\/h3>\s*/', '', $this->_blocks['contact_info']);
    }
623

624
    /**
     * Rewrites links inside the extracted blocks and stores the blocks.
     *
     * For every block, each href transform recorded under its
     * '<block name>' marker is applied in turn. The results are written to
     * the block_custom rows with bid 101 (contact info), 102 (related
     * links), 103 (optional footer) and 104 (footer content).
     */
    private function _create_blocks() {

      foreach ($this->_blocks as $blockName => $block) {
        $transformKey = '<' . $blockName . '>';
        if (!isset($this->_hrefTransform[$transformKey])) {
          continue;
        }
        // Each replacement must build on the previous one; replacing into
        // the untouched original every time would keep only the last
        // transform.
        foreach ($this->_hrefTransform[$transformKey] as $hrefTransformFrom => $hrefTransformTo) {
          $block = str_replace(htmlspecialchars($hrefTransformFrom), htmlspecialchars($hrefTransformTo), $block);
        }
        $this->_blocks[$blockName] = $block;
      }

      // Custom block id => name of the extracted block stored in it.
      $blockIds = array(
        101 => 'contact_info',
        102 => 'related_links',
        103 => 'optional_footer',
        104 => 'footer_content',
      );
      foreach ($blockIds as $bid => $blockName) {
        db_update('block_custom')
          ->fields(array(
            'body'   => $this->_blocks[$blockName],
          ))
          ->condition('bid', $bid)
          ->execute();
      }
    }

661
662
663
    /**
     * Collects the intermediate breadcrumb links from the site's front page.
     *
     * Looks inside the element with id "breadcrumbs". The first link is
     * always skipped; the last link is skipped as well when there are more
     * <li> elements than links. The remaining links are appended to
     * $this->_breadcrumbs as 'text'/'href' pairs.
     */
    private function _process_breadcrumbs() {
      $page = $this->_getUrl($this->_baseUrl);

      $document = new DOMDocument();
      $document->loadHTML($page['content']);
      $container = $document->getElementById('breadcrumbs');
      if (!$container) {
        return;
      }

      $anchors = $container->getElementsByTagName('a');
      $listItems = $container->getElementsByTagName('li');

      // More list items than links means one breadcrumb is not a link.
      $hasUnlinkedCrumb = $listItems->length > $anchors->length;

      // Scan each of the breadcrumb links, skipping the first and the last (but only if there's an un-linked "true" last breadcrumb)
      $stopBefore = $anchors->length - ($hasUnlinkedCrumb ? 1 : 0);
      for ($index = 1; $index < $stopBefore; $index++) {
        $anchor = $anchors->item($index);
        $crumb = array();
        $crumb['text'] = trim($anchor->textContent);
        $crumb['href'] = $this->_makeLinkAbsolute($anchor->getAttribute('href'), '');
        $this->_breadcrumbs[] = $crumb;
      }
    }

689
690
691
692
693
    /**
     * Stores the collected breadcrumbs in the 'theme_unl_wdn_settings'
     * variable under the 'intermediate_breadcrumbs' key, leaving the other
     * entries of that variable as they are.
     */
    private function _create_breadcrumbs() {
      $settingsName = 'theme_unl_wdn_settings';

      $themeSettings = variable_get($settingsName, array());
      $themeSettings['intermediate_breadcrumbs'] = $this->_breadcrumbs;

      variable_set($settingsName, $themeSettings);
    }
694

695
696
697
698
699
    /**
     * Crawls the Liferay sitemap portlet of the site and its known subsites.
     *
     * Does nothing unless Liferay detection is enabled. The sitemap URL of
     * the base site, plus one per subsite listed for the host in
     * $this->_liferaySubsites, is fetched; every link found is passed to
     * _processLinks(). When Liferay titles are enabled, the link text is
     * remembered as the title of the returned path.
     */
    private function _process_liferay_sitemap() {
      if (!$this->_useLiferayCode) {
        return;
      }

      $urls = array();
      $urls[] = $this->_baseUrl . '?p_p_id=EXT_SITEMAP&p_p_state=exclusive&p_p_mode=view';

      $host = parse_url($this->_baseUrl, PHP_URL_HOST);
      if (array_key_exists($host, $this->_liferaySubsites)) {
        foreach ($this->_liferaySubsites[$host] as $subSite) {
          $urls[] = $this->_baseUrl . 'web/' . $subSite . '/?p_p_id=EXT_SITEMAP&p_p_state=exclusive&p_p_mode=view';
        }
      }

      foreach ($urls as $url) {
        $data = $this->_getUrl($url);

        // One missing, empty or non-HTML sitemap must not stop the
        // remaining subsites from being read, so skip it and carry on.
        if (!$data
            || !isset($data['contentType'])
            || strpos($data['contentType'], 'html') === FALSE
            || empty($data['content'])) {
          continue;
        }

        $dom = new DOMDocument();
        @$dom->loadHTML($data['content']);

        $linkNodes = $dom->getElementsByTagName('a');
        foreach ($linkNodes as $linkNode) {
          $path = $this->_processLinks($linkNode->getAttribute('href'), '');
          // NOTE(review): _processLinks() is not visible from here; this
          // assumes it returns the site-relative path as a string for links
          // it accepted. Anything else is not usable as a title key.
          if ($this->_useLiferayTitles && is_string($path)) {
            $this->_liferayPageTitles[$path] = trim($linkNode->textContent);
          }
        }
      }
    }
729

730
731
    private function _processPage($path)
    {
732
733
        $this->_addProcessedPage($path);
        $fullPath = $this->_baseUrl . $path;
734
735
        
        $this->_log('Processing page: ' . $path, WATCHDOG_DEBUG);
736

737
        $url = $this->_baseUrl . $path;
738

739
740
        $data = $this->_getUrl($url);
        if (!$data['content']) {
741
            $this->_log('The file at ' . $fullPath . ' was empty! Ignoring.', WATCHDOG_ERROR);
742
            return;
743
        }
744

745
746
        $pageHash = hash('md5', $data['content']);
        if (($matchingPath = array_search($pageHash, $this->_processedPageHashes)) !== FALSE) {
747
748
            $logMessage = "The file found at $fullPath was a duplicate of the file at {$this->_baseUrl}$matchingPath !";
            if ($this->_ignoreDuplicates) {
749
                $this->_log($logMessage . ' Ignoring.', WATCHDOG_WARNING);
750
751
                return;
            } else {
752
                $this->_log($logMessage, WATCHDOG_WARNING);
753
            }
754
        }
755
        $this->_processedPageHashes[$path] = $pageHash;
756

757
        if (isset($data['lastModified'])) {
758
            $this->_lastModifications[$path] = $data['lastModified'];
759
        }
760

761
762
763
764
        $cleanPath = $path;
        $pathParts = parse_url($path);
        // If the path contains a query, we'll have to change it.
        if (array_key_exists('query', $pathParts)) {
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
          // If a Content-Disposition header exists with a filename, grab it.
          $altFileName = '';
          $matches = array();
          if (array_key_exists('Content-Disposition', $data['headers']) &&
              preg_match('/filename="(.*)"/', $data['headers']['Content-Disposition'], $matches)) {
            $altFileName = $matches[1];
          }

          // Parse the query string
          $query = array();
          parse_str($pathParts['query'], $query);
          
          // If this is a liferay file, just save it as <uuid>.<ext> in the root files directory.
          if ($pathParts['path'] == 'c/document_library/get_file' && $query['uuid']) {
            if (strrpos($pathParts['query'], '.') > strrpos($pathParts['query'], '&') && strrpos($pathParts['query'], '.') !== FALSE) {
              $cleanPath = $query['uuid'] . substr($pathParts['query'], strrpos($pathParts['query'], '.'));
            }
            else if ($altFileName && strpos($altFileName, '.') !== FALSE) {
              $cleanPath = $query['uuid'] . substr($altFileName, strrpos($altFileName, '.'));
784
            } else {
785
              $cleanPath = $query['uuid'];
786
            }
787
788
789
790
791
792
793
794
795
796
          }
          // Or, if it exists, save it as the content-disposition name.
          else if ($altFileName) {
            $cleanPath = $pathParts['path'] . '/' . $altFileName;
          }
          // Otherwise, just save it with a / instead of a ?.
          else {
            $cleanPath = $pathParts['path'] . '/' . $pathParts['query'];
          }
          $cleanPath = strtr($cleanPath, array('%2f' => '/', '%2F' => '/'));
797
        }
798

799
800
801
802
        if (strpos($data['contentType'], 'html') === FALSE) {
          if (!$data['contentType']) {
            $this->_log('The file type at ' . $fullPath . ' was not specified. Ignoring.', WATCHDOG_ERROR);
            return;
803
          }
804

805
          @drupal_mkdir('public://' . urldecode(dirname($cleanPath)), NULL, TRUE);
806
          if (!mb_check_encoding($path, 'UTF-8')) {
807
              $path = iconv('ISO-8859-1', 'UTF-8', $path);
808
          }
809

810
          try {
811
            $file = file_save_data($data['content'], 'public://' . urldecode($cleanPath), FILE_EXISTS_REPLACE);
812
813
814
          } catch (Exception $e) {
            $this->_log('Could not migrate file "' . $path . '"! File name too long?', WATCHDOG_ERROR);
          }
815
          $this->_hrefTransformFiles[$path] = $this->_makeRelativeUrl(file_create_url('public://' . $cleanPath));
816
          return;
817
818
        }
        $html = $data['content'];
819

820
        $maincontentarea = '';
821

822
823
824
        if ($path != '') {
          $maincontentarea = $this->_get_liferay_content_area($html);
        }
825

826
827
828
        if (!$maincontentarea) {
          $maincontentarea = $this->_get_instance_editable_content($html, 'maincontentarea');
        }
829

830
831
832
        if (!$maincontentarea) {
            $maincontentarea = $this->_get_old_main_content_area($html);
        }
833

834
        if (!$maincontentarea) {
835
            $this->_log('The file at ' . $fullPath . ' has no valid maincontentarea. Using entire body.', WATCHDOG_WARNING);
836
837
            $maincontentarea = $this->_get_text_between_tokens($html, '<body>', '</body>');
        }
838

839
840
        if (!$maincontentarea) {
            // It's possible the body tag has attributes.  Check for this and filter them out.
841
            $maincontentarea = $this->_get_text_between_tokens($html, '<body', '</body>', FALSE);
842
            // As long as we find a closing bracket before the next opening bracket, it's probably safe to assume the body tag is intact.
843
844
            if (strpos($maincontentarea, '>') < strpos($maincontentarea, '<')) {
              $maincontentarea = trim(substr($maincontentarea, strpos($maincontentarea, '>') + 1));
845
846
              // Tidy the output here, otherwise tidy would see HTML starting in the middle of a <body key="val"> tag.
              $maincontentarea = $this->_tidy_html_fragment($maincontentarea);
847
848
849
850
851
            // Otherwise, ignore it all. (Will be an issue if the body has no other tags, but how likely is this?)
            } else {
              $maincontentarea = '';
            }
        }
852

853
        if (!$maincontentarea) {
854
            $this->_log('The file at ' . $fullPath . ' has no valid body. Ignoring.', WATCHDOG_ERROR);
855
856
            return;
        }
857

858
        $dom = new DOMDocument();
859
        @$dom->loadHTML($html);
860

861
862
863
864
865
866
        // Check to see if there's a base tag on this page.
        $base_tags = $dom->getElementsByTagName('base');
        $page_base = NULL;
        if ($base_tags->length > 0) {
          $page_base = $base_tags->item(0)->getAttribute('href');
        }
867

868
        $pageTitle = '';
869
870
871
872
        if ($this->_liferayPageTitles[$path]) {
          $pageTitle = $this->_liferayPageTitles[$path];
        }

873
        $pageTitleNode = $dom->getElementById('pagetitle');
874
        if (!$pageTitle && $pageTitleNode) {
875
876
877
          // Search for the WDN 3.1 page title
          $pageTitleH1Nodes = $pageTitleNode->getElementsByTagName('h1');
          if ($pageTitleH1Nodes->length > 0) {
Tim Steiner's avatar
Tim Steiner committed
878
            $pageTitle = $pageTitleH1Nodes->item(0)->textContent;
879
880
881
          }
          if (!$pageTitle) {
            // If not found, search for the earlier version of the WDN page title
882
883
            $pageTitleH2Nodes = $pageTitleNode->getElementsByTagName('h2');
            if ($pageTitleH2Nodes->length > 0) {
Tim Steiner's avatar
Tim Steiner committed
884
              $pageTitle = $pageTitleH2Nodes->item(0)->textContent;
885
            }
886
          }
887
888
889
890

          if ($pageTitle && $this->_useLiferayTitles) {
            $pageTitle .= rtrim(' ' . basename($path));
          }
891
        }
892

893
        // If there is no WDN compliant title, search for others
894
        if (!$pageTitle) {
895
896
897
898
          // First, check for a WDN compliant <title>
          $titleText = '';
          $titleNodes = $dom->getElementsByTagName('title');
          if ($titleNodes->length > 0) {
899
            $titleText = $titleNodes->item(0)->textContent;
900
901
902
903
904
905
906
907
908
909
910
911
912
          }
          $titleParts = explode('|', $titleText);
          if (count($titleParts) > 2) {
            $pageTitle = trim(array_pop($titleParts));
          }
          // Finally, combine what title does exist with the last part of the path
          else {
            $filename = trim($path, '/');
            $filename = explode('/', $filename);
            $filename = array_pop($filename);
            // Strip off a file extension if it exists.
            if (strrpos($filename, '.') !== FALSE) {
              $filename = substr($filename, 0, strrpos($filename, '.'));
913
            }
914
915
            $pageTitle = "$titleText ($filename)";
          }
916
        }
917

918
        if (!$pageTitle) {
919
            $this->_log('No page title was found at ' . $fullPath . '.', WATCHDOG_ERROR);
920
921
            $pageTitle = 'Untitled';
        }
922

923
        $maincontentNode = $dom->getElementById('maincontent');
924
        if (!$maincontentNode) {
925
            $this->_log('The file at ' . $fullPath . ' has no valid maincontentarea. Using entire body.', WATCHDOG_WARNING);
926
927
            $bodyNodes = $dom->getElementsByTagName('body');
            if ($bodyNodes->length == 0) {
928
                $this->_log('The file at ' . $fullPath . ' has no valid body. Ignoring.', WATCHDOG_ERROR);
929
930
931
                return;
            }
            $maincontentNode = $bodyNodes->item(0);
932
        }
933

934
935
        $linkNodes = $maincontentNode->getElementsByTagName('a');
        foreach ($linkNodes as $linkNode) {
936
            $this->_processLinks($linkNode->getAttribute('href'), $path, $page_base);
937
        }
938

939
940
        $linkNodes = $maincontentNode->getElementsByTagName('img');
        foreach ($linkNodes as $linkNode) {
941
            $this->_processLinks($linkNode->getAttribute('src'), $path, $page_base);
942
        }
943

944
945
        $this->_content[$cleanPath] = $maincontentarea;
        $this->_pageTitles[$cleanPath] = $pageTitle;
946

947
948
949
        // Scan the page for the parent breadcrumb
        $breadcrumbs = $dom->getElementById('breadcrumbs');
        if ($breadcrumbs) {