View file common/Extractor.php

File size: 6.33Kb
<?php
/**
 * @author     Mike Cochrane <mikec@mikenz.geek.nz>
 * @author     Nick Pope <nick@nickpope.me.uk>
 * @copyright  Copyright © 2010, Mike Cochrane, Nick Pope
 * @license    http://www.apache.org/licenses/LICENSE-2.0  Apache License v2.0
 * @package    Twitter
 */

require_once 'Regex.php';

/**
 * Twitter Extractor Class
 *
 * Parses tweets and extracts URLs, usernames, username/list pairs and
 * hashtags.
 *
 * Originally written by {@link http://github.com/mikenz Mike Cochrane}, this
 * is based on code by {@link http://github.com/mzsanford Matt Sanford} and
 * heavily modified by {@link http://github.com/ngnpope Nick Pope}.
 *
 * @author     Mike Cochrane <mikec@mikenz.geek.nz>
 * @author     Nick Pope <nick@nickpope.me.uk>
 * @copyright  Copyright © 2010, Mike Cochrane, Nick Pope
 * @license    http://www.apache.org/licenses/LICENSE-2.0  Apache License v2.0
 * @package    Twitter
 */
class Twitter_Extractor extends Twitter_Regex {

  /**
   * Provides fluent method chaining.
   *
   * @param  string  $tweet        The tweet to be converted.
   *
   * @see  __construct()
   *
   * @return  Twitter_Extractor
   */
  public static function create($tweet) {
    return new self($tweet);
  }

  /**
   * Reads in a tweet to be parsed and extracts elements from it.
   *
   * Extracts various parts of a tweet including URLs, usernames, hashtags...
   *
   * @param  string  $tweet  The tweet to extract.
   */
  public function __construct($tweet) {
    parent::__construct($tweet);
  }

  /**
   * Extracts all parts of a tweet and returns an associative array containing
   * the extracted elements.
   *
   * The returned array has the keys 'hashtags', 'urls', 'mentions', 'replyto',
   * 'hashtags_with_indices', 'urls_with_indices' and 'mentions_with_indices',
   * each holding the result of the correspondingly named extract*() method.
   *
   * @return  array  The elements in the tweet.
   */
  public function extract() {
    return array(
      'hashtags' => $this->extractHashtags(),
      'urls'     => $this->extractURLs(),
      'mentions' => $this->extractMentionedUsernames(),
      'replyto'  => $this->extractRepliedUsernames(),
      'hashtags_with_indices' => $this->extractHashtagsWithIndices(),
      'urls_with_indices'     => $this->extractURLsWithIndices(),
      'mentions_with_indices' => $this->extractMentionedUsernamesWithIndices(),
    );
  }

  /**
   * Extracts all the hashtags from the tweet.
   *
   * The hashtag text is taken from the third capture group of REGEX_HASHTAG.
   *
   * @return  array  The hashtag elements in the tweet.
   */
  public function extractHashtags() {
    preg_match_all(self::REGEX_HASHTAG, $this->tweet, $matches);
    return isset($matches[3]) ? $matches[3] : array();
  }

  /**
   * Extracts all the URLs from the tweet.
   *
   * URLs that start with an explicit http:// or https:// protocol (in any
   * letter case) are returned as matched.  Matches without such a protocol
   * are kept only when they start with 'www.' or their domain matches
   * REGEX_PROBABLE_TLD, and are returned prefixed with 'http://'.
   *
   * @return  array  The URL elements in the tweet.
   */
  public function extractURLs() {
    preg_match_all(self::$REGEX_VALID_URL, $this->tweet, $matches);
    # Pad with empty arrays rather than strings so that a missing capture
    # group can still be iterated and indexed safely.
    $matches = array_pad($matches, 5, array());
    $protocols = $matches[3];
    $domains   = $matches[4];
    $urls = array();
    foreach ($matches[2] as $i => $url) {
      # Note: $protocol can contain 'www.' if no protocol exists!
      $protocol = isset($protocols[$i]) ? $protocols[$i] : '';
      $domain   = isset($domains[$i]) ? $domains[$i] : '';
      # Case-insensitive, in line with the case-insensitive 'www.' test below.
      if (preg_match('!https?://!i', $protocol)) {
        $urls[] = $url;
        continue;
      }
      $is_www = strtolower($protocol) === 'www.';
      if ($is_www || preg_match(self::REGEX_PROBABLE_TLD, $domain)) {
        # NOTE(review): only the (www.)domain part is kept here, so any path
        # or query string of a protocol-less URL is discarded -- confirm that
        # this is intended.
        $urls[] = 'http://'.($is_www ? $protocol : '').$domain;
      }
    }
    return $urls;
  }

  /**
   * Extract all the usernames from the tweet.
   *
   * A mention is an occurrence of a username anywhere in a tweet.
   *
   * @return  array  The usernames elements in the tweet.
   */
  public function extractMentionedUsernames() {
    preg_match_all(self::REGEX_USERNAME_MENTION, $this->tweet, $matches);
    # Pad with empty arrays rather than strings so that a missing capture
    # group can still be iterated and indexed safely.
    $matches = array_pad($matches, 4, array());
    $usernames = array();
    foreach ($matches[2] as $i => $username) {
      # If the "after" group is not empty, there is an invalid character.
      if (!empty($matches[3][$i])) continue;
      $usernames[] = $username;
    }
    return $usernames;
  }

  /**
   * Extract the username replied to from the tweet.
   *
   * A reply is an occurrence of a username at the beginning of a tweet.
   *
   * @return  string  The username replied to, or an empty string when the
   *                  tweet does not match the reply pattern.
   */
  public function extractRepliedUsernames() {
    preg_match(self::$REGEX_REPLY_USERNAME, $this->tweet, $matches);
    return isset($matches[2]) ? $matches[2] : '';
  }

  /**
   * Extracts all the hashtags and the indices they occur at from the tweet.
   *
   * Each element is an array with the keys 'hashtag' (the text of the third
   * capture group) and 'indices' (array(start, end) in characters).  The
   * range is one character longer than the hashtag text, which accounts for
   * the hash sign in front of it.
   *
   * @return  array  The hashtag elements in the tweet.
   */
  public function extractHashtagsWithIndices() {
    return $this->extractElementsWithIndices(self::REGEX_HASHTAG, 3, 'hashtag', 1);
  }

  /**
   * Extracts all the URLs and the indices they occur at from the tweet.
   *
   * Each element is an array with the keys 'url' and 'indices'
   * (array(start, end) in characters).
   *
   * NOTE(review): unlike extractURLs(), matches without a protocol are
   * neither filtered nor rewritten here -- confirm whether that is intended.
   *
   * @return  array  The URLs elements in the tweet.
   */
  public function extractURLsWithIndices() {
    return $this->extractElementsWithIndices(self::$REGEX_VALID_URL, 2, 'url', 0);
  }

  /**
   * Extracts all the usernames and the indices they occur at from the tweet.
   *
   * Each element is an array with the keys 'screen_name' and 'indices'
   * (array(start, end) in characters).  The range is one character longer
   * than the username, which accounts for the at sign in front of it.
   * Matches with a non-empty "after" group are skipped, exactly as in
   * extractMentionedUsernames().
   *
   * @return  array  The username elements in the tweet.
   */
  public function extractMentionedUsernamesWithIndices() {
    return $this->extractElementsWithIndices(self::REGEX_USERNAME_MENTION, 2, 'screen_name', 1, 3);
  }

  /**
   * Runs a pattern over the tweet and returns one capture group of every
   * match together with its character range.
   *
   * The start of a range is the character position at which the first
   * capture group (the text preceding the element) ends; the end is the start
   * plus the character length of the captured text plus $prefix_length.
   * Character counts come from mb_strlen() without an explicit encoding, so
   * they depend on the mbstring internal encoding.
   *
   * @param  string    $regex          The pattern to match against the tweet.
   * @param  int       $group          The capture group holding the element.
   * @param  string    $key            The key to store the element text under.
   * @param  int       $prefix_length  Characters between the end of the first
   *                                   capture group and the element text.
   * @param  int|null  $reject_group   Optional capture group which, when not
   *                                   empty, causes the match to be skipped.
   *
   * @return  array  A list of array($key => text, 'indices' => array(start, end)).
   */
  private function extractElementsWithIndices($regex, $group, $key, $prefix_length, $reject_group = null) {
    preg_match_all($regex, $this->tweet, $matches, PREG_OFFSET_CAPTURE);
    $elements = array();
    if (empty($matches[$group])) {
      return $elements;
    }
    foreach ($matches[$group] as $i => $capture) {
      if ($reject_group !== null && isset($matches[$reject_group][$i])) {
        $rejected = $matches[$reject_group][$i];
        # With PREG_OFFSET_CAPTURE a group is normally array(text, offset),
        # but an unmatched group may be reported as a plain string instead.
        if (is_array($rejected)) {
          $rejected = $rejected[0];
        }
        if (!empty($rejected)) continue;
      }
      $before = $matches[1][$i];
      # PREG_OFFSET_CAPTURE returns byte offsets: count the characters in
      # front of the byte offset to obtain a character position.
      $start = mb_strlen(substr($this->tweet, 0, $before[1])) + mb_strlen($before[0]);
      $end = $start + mb_strlen($capture[0]) + $prefix_length;
      $elements[] = array($key => $capture[0], 'indices' => array($start, $end));
    }
    return $elements;
  }

}