A simple CodeIgniter module for parsing RSS feed data

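  •  Copy lastRSS.php into your models folder, next to the model file (the model includes it as 'lastRSS.php', so the name must match exactly on case-sensitive file systems). The resulting layout is:
  • Controller
  • controller.php
  • Model
  • model.php
  • lastRSS.php
  • View
  •  view.tpl (your view's HTML file)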

1) Controller (file name: admin.php)

 

<?php

/**
 * Admin controller of the RSS crawler module.
 *
 * index() forwards to the crawl action; viewall() runs the crawl through the
 * crawler model and then redirects to the admin dashboard.
 */
class Admin extends MX_Controller
{
    // Table to update.
    public $table = 'rss_items';

    // Path to the includes used for header and footer views.
    public $includes_path = '/includes/admin';

    // Target that index() redirects to.
    public $redirect = '/admin/crawler/viewall';

    // Default unique ID column.
    public $objectID = 'id';

    public $permissions = array();

    // Site identifier; filled from the SITEID constant when it is defined.
    // Declared explicitly so the constructor does not create a dynamic property.
    public $siteID = NULL;

    /**
     * Picks up the site ID (if available) and loads the crawler model,
     * which is then reachable as $this->crawler.
     */
    public function __construct()
    {
        parent::__construct();

        // get siteID, if available
        if (defined('SITEID'))
        {
            $this->siteID = SITEID;
        }

        // load models and libs
        $this->load->model('crawlers_model', 'crawler');
    }

    /**
     * Default action: forward to the configured redirect target.
     */
    public function index()
    {
        redirect($this->redirect);
    }

    /**
     * Runs the crawl and sends the user to the dashboard.
     *
     * The return value of crawl() is not needed here: nothing is rendered
     * by this action, it only redirects.
     * NOTE(review): "m=1" is presumably a "success message" flag read by the
     * dashboard - confirm against the dashboard controller.
     */
    public function viewall()
    {
        $this->crawler->crawl();

        redirect('/admin/dashboard?m=1');
    }
}
?>

2) Model file (file name: Crawlers_model.php)

<?php
// ------------------------------------------------------------------------

/**
 * Crawlers_model
 *
 * Reads the feed definitions from the `rss_feeds` table, downloads every feed
 * through lastRSS and stores each item whose URL is not stored yet.
 */
class Crawlers_model extends CI_Model
{
    // Site identifier; filled from the SITEID constant when it is defined.
    public $siteID = NULL;

    // Table used both for the duplicate check and for the insert.
    // NOTE(review): the published code checked 'rss_items' but inserted into
    // 'ha_rss_items'. If 'ha_' is the configured dbprefix, CodeIgniter should
    // add it automatically, so the unprefixed name is used for both - confirm
    // against your database configuration.
    public $items_table = 'rss_items';

    public function __construct()
    {
        parent::__construct();

        // get siteID, if available
        if (defined('SITEID'))
        {
            $this->siteID = SITEID;
        }
    }

    /**
     * Fetches all configured feeds.
     *
     * @return mixed The query result object for `rss_feeds`, or FALSE when
     *               the table holds no rows.
     */
    public function get_rss_feeds()
    {
        $rss_feeds = $this->db->get('rss_feeds');

        if ($rss_feeds->num_rows() > 0)
        {
            return $rss_feeds;
        }

        return FALSE;
    }

    /**
     * Crawls every configured feed and stores the items that are new.
     *
     * A feed row is skipped when its URL, category or sub-category is empty.
     * An item counts as already stored when a row with the same `url` exists.
     *
     * Item tags that lastRSS can deliver:
     * title, link, description, category, comments, guid, pubDate,
     * dc:creator, wfw:commentRss, slash:comments, content:encoded
     *
     * @return int Number of items inserted during this run.
     */
    public function crawl()
    {
        $inserted = 0;

        $feeds = $this->get_rss_feeds();
        if ($feeds === FALSE)
        {
            return $inserted;
        }

        // Guarded so a second crawl() in the same request does not try to
        // declare the lastRSS class again.
        if ( ! class_exists('lastRSS', FALSE))
        {
            include_once('lastRSS.php');
        }
        $rss = new lastRSS;

        foreach ($feeds->result() as $row)
        {
            // Use the URL stored with the feed row (the published code had a
            // hard-coded test URL here, so every row crawled the same feed).
            $crawl_url     = $row->rss_url;
            $main_category = $row->rss_category;
            $sub_category  = $row->rss_subcategory;

            if ($crawl_url == '' || $main_category == '' || $sub_category == '')
            {
                continue;
            }

            // lastRSS returns False when the feed cannot be opened.
            $feed = $rss->get($crawl_url);
            if ( ! is_array($feed) || empty($feed['items']))
            {
                continue;
            }

            foreach ($feed['items'] as $item)
            {
                $record = $this->_build_record($item, $crawl_url, $main_category, $sub_category);

                $this->db->where('url', $record['url']);
                $exists_query = $this->db->get($this->items_table);

                if ($exists_query->num_rows() <= 0)
                {
                    $this->db->insert($this->items_table, $record);
                    $inserted++;
                }
            }
        }

        return $inserted;
    }

    /**
     * Returns $item[$key] when it is set and not empty, '' otherwise.
     */
    private function _field($item, $key)
    {
        return (isset($item[$key]) && $item[$key] != '') ? $item[$key] : '';
    }

    /**
     * Removes CDATA markers, decodes HTML entities and converts UTF-8 to ISO-8859-1.
     */
    private function _clean_text($raw)
    {
        return utf8_decode(html_entity_decode(str_replace(array('<![CDATA[', ']]>'), '', $raw)));
    }

    /**
     * Maps one parsed feed item onto the column layout of the items table.
     *
     * @param array  $item          One entry of the 'items' array returned by lastRSS.
     * @param string $crawl_url     URL of the feed the item came from.
     * @param string $main_category Category of the feed row.
     * @param string $sub_category  Sub-category of the feed row.
     * @return array Column => value pairs ready for insert().
     */
    private function _build_record($item, $crawl_url, $main_category, $sub_category)
    {
        $title       = $this->_clean_text($this->_field($item, 'title'));
        $description = $this->_clean_text($this->_field($item, 'description'));

        // Descriptions carrying "feedflare" markup are cut off just before the
        // first <div>. Only truncate when a <div> is actually present; the
        // original also chopped the last character when none was found.
        if (strstr($description, 'feedflare'))
        {
            $end_at = strpos($description, '<div>');
            if ($end_at !== FALSE)
            {
                $description = substr($description, 0, max(0, $end_at - 1));
            }
        }

        $content_encoded = $this->_field($item, 'content:encoded');

        // lastRSS delivers 'image' as an array of URLs; the column holds a
        // single value, so the first URL is stored.
        // NOTE(review): confirm that keeping only the first image is intended.
        $image = $this->_field($item, 'image');
        if (is_array($image))
        {
            $image = (string) reset($image);
        }

        // media type: anything that mentions YouTube or Vimeo counts as video
        if (strstr($description, 'youtub') || strstr($description, 'vimeo')
            || strstr($content_encoded, 'vimeo') || strstr($content_encoded, 'youtub'))
        {
            $media_type = 'video';
        }
        else
        {
            $media_type = 'article';
        }

        // strtotime() yields FALSE for a missing or unparsable date; store 0 then.
        $pub_date = strtotime($this->_field($item, 'pubDate'));
        if ($pub_date === FALSE)
        {
            $pub_date = 0;
        }

        return array(
            'title'              => $title,
            'description'        => $description,
            'content_encoded'    => $content_encoded,
            'url'                => $this->_field($item, 'link'),
            'parent_rss'         => $crawl_url,
            'parent_category'    => $main_category,
            'parent_subcategory' => $sub_category,
            'rssfeed_category'   => $this->_field($item, 'category'),
            'rssfeed_author'     => $this->_field($item, 'author'),
            'guid'               => $this->_field($item, 'guid'),
            'pub_date'           => $pub_date,
            'media_type'         => $media_type,
            'image'              => $image
        );
    }
}
?>

3) lastRSS.php (the file name must match the include in the model)

<?php
/**
 * lastRSS - a small regular-expression based RSS reader.
 *
 * Get($rss_url) returns an associative array with the channel fields, the
 * optional image_* / textinput_* fields, an 'items' list, 'items_count' and a
 * 'cached' flag, or False when the feed cannot be opened.
 */
class lastRSS
{
    // -------------------------------------------------------------------
    // Public properties
    // -------------------------------------------------------------------
    public $default_cp = 'UTF-8';   // code page assumed when the feed declares none
    public $CDATA = 'content';      // 'content' = keep CDATA text, 'strip' = drop CDATA sections
    public $cp = '';                // target code page; '' = no conversion
    public $items_limit = 0;        // maximum number of items to parse; 0 = no limit
    public $stripHTML = False;      // strip tags from item title and description
    public $date_format = '';       // date() format for lastBuildDate / pubDate; '' = leave as is
    public $cache_dir = '';         // cache directory; '' = caching disabled
    public $cache_time = 0;         // seconds a cache file is considered fresh

    // -------------------------------------------------------------------
    // Private variables
    // -------------------------------------------------------------------
    public $rsscp = '';             // code page of the feed being parsed (set by Parse())
    public $channeltags = array(
        'title', 'link', 'description', 'language', 'copyright',
        'managingEditor', 'webMaster', 'lastBuildDate', 'rating', 'docs'
    );
    public $itemtags = array(
        'title', 'link', 'description', 'author', 'category', 'comments',
        'enclosure', 'guid', 'pubDate', 'dc:creator', 'media:content',
        'media:title', 'wfw:commentRss', 'slash:comments', 'content:encoded',
        'source'
    );
    public $imagetags = array('title', 'url', 'link', 'width', 'height');
    public $textinputtags = array('title', 'description', 'name', 'link');

    // -------------------------------------------------------------------
    // Parse RSS file and returns associative array.
    // -------------------------------------------------------------------
    /**
     * @param string $rss_url URL or path of the feed.
     * @return mixed Parsed feed array (with 'cached' set to 1 or 0), or False.
     */
    public function Get($rss_url)
    {
        // Cache disabled: load and parse the feed directly.
        if ($this->cache_dir == '')
        {
            $result = $this->Parse($rss_url);
            if ($result) $result['cached'] = 0;
            return $result;
        }

        $cache_file = $this->cache_dir . '/rsscache_' . md5($rss_url);
        $mtime = is_file($cache_file) ? filemtime($cache_file) : False;

        if ($mtime !== False && (time() - $mtime) < $this->cache_time)
        {
            // Cached file is fresh enough, return the cached array.
            // NOTE(review): unserialize() is only safe while cache_dir cannot
            // be written by untrusted parties.
            $result = unserialize(join('', file($cache_file)));
            // set 'cached' to 1 only if cached file is correct
            if ($result) $result['cached'] = 1;
            return $result;
        }

        // Cached file is missing or too old: parse and rewrite it.
        $result = $this->Parse($rss_url);
        $serialized = serialize($result);
        if ($f = @fopen($cache_file, 'w'))
        {
            fwrite($f, $serialized, strlen($serialized));
            fclose($f);
        }
        if ($result) $result['cached'] = 0;

        return $result;
    }

    // -------------------------------------------------------------------
    // Modification of preg_match(); return trimmed field with index 1
    // from 'classic' preg_match() array output
    // -------------------------------------------------------------------
    /**
     * @param string $pattern Regular expression with at least one capture group.
     * @param string $subject Text to search.
     * @return string Trimmed first capture group, or '' when nothing matched.
     */
    public function my_preg_match($pattern, $subject)
    {
        preg_match($pattern, $subject, $out);

        // if there is NO result, return empty string
        if ( ! isset($out[1]))
        {
            return '';
        }

        if ($this->CDATA == 'content')
        {
            // Get CDATA content (without CDATA tag)
            $out[1] = strtr($out[1], array('<![CDATA[' => '', ']]>' => ''));
        }
        elseif ($this->CDATA == 'strip')
        {
            // Strip CDATA: remove the whole section, markers and content.
            // (The original repeated the 'content' branch here, so nothing
            // was ever stripped.)
            $out[1] = preg_replace('#<!\[CDATA\[.*?\]\]>#s', '', $out[1]);
        }

        // If code page is set convert character encoding to required
        if ($this->cp != '')
        {
            $out[1] = iconv($this->rsscp, $this->cp . '//TRANSLIT', $out[1]);
        }

        return trim($out[1]);
    }

    // -------------------------------------------------------------------
    // Replace HTML entities &something; by real characters
    // -------------------------------------------------------------------
    public function unhtmlentities($string)
    {
        // Get HTML entities table and flip keys<==>values
        $trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES, ENT_QUOTES));
        // Add support for &apos; entity (missing in HTML_ENTITIES)
        $trans_tbl += array('&apos;' => "'");
        // Replace entities by values
        return strtr($string, $trans_tbl);
    }

    // -------------------------------------------------------------------
    // Parse() is private method used by Get() to load and parse RSS file.
    // Don't use Parse() in your scripts - use Get($rss_file) instead.
    // -------------------------------------------------------------------
    /**
     * @param string $rss_url URL or path of the feed.
     * @return mixed Parsed feed array, or False when the feed cannot be opened.
     */
    public function Parse($rss_url)
    {
        // Open and load RSS file
        $f = @fopen($rss_url, 'r');
        if ( ! $f)
        {
            // Error in opening: return False
            return False;
        }

        $rss_content = '';
        while ( ! feof($f))
        {
            $line = fgets($f, 4096);
            if ($line === False)
            {
                // Read error before EOF: stop instead of looping forever.
                break;
            }
            $rss_content .= $line;
        }
        fclose($f);

        $result = array();

        // Parse document encoding
        $result['encoding'] = $this->my_preg_match("'encoding=[\'\"](.*?)[\'\"]'si", $rss_content);
        // Use the declared code page, otherwise the default one.
        // rsscp is read by my_preg_match() when a conversion is requested.
        $this->rsscp = ($result['encoding'] != '') ? $result['encoding'] : $this->default_cp;

        // Parse CHANNEL info (skipped when the document has no <channel>)
        if (preg_match("'<channel.*?>(.*?)</channel>'si", $rss_content, $out_channel))
        {
            foreach ($this->channeltags as $channeltag)
            {
                $temp = $this->my_preg_match("'<$channeltag.*?>(.*?)</$channeltag>'si", $out_channel[1]);
                if ($temp != '') $result[$channeltag] = $temp; // Set only if not empty
            }
        }

        // If date_format is specified and lastBuildDate is valid, convert it.
        // strtotime() reports failure with False, not -1.
        if ($this->date_format != '' && isset($result['lastBuildDate']))
        {
            $timestamp = strtotime($result['lastBuildDate']);
            if ($timestamp !== False)
            {
                $result['lastBuildDate'] = date($this->date_format, $timestamp);
            }
        }

        // Parse TEXTINPUT info
        // This a little strange regexp means: look for tag <textinput> with or
        // without any attributes, but skip the truncated version <textinput />
        // (it's not a beginning tag)
        preg_match("'<textinput(|[^>]*[^/])>(.*?)</textinput>'si", $rss_content, $out_textinfo);
        if (isset($out_textinfo[2]))
        {
            foreach ($this->textinputtags as $textinputtag)
            {
                $temp = $this->my_preg_match("'<$textinputtag.*?>(.*?)</$textinputtag>'si", $out_textinfo[2]);
                if ($temp != '') $result['textinput_' . $textinputtag] = $temp; // Set only if not empty
            }
        }

        // Parse IMAGE info
        preg_match("'<image.*?>(.*?)</image>'si", $rss_content, $out_imageinfo);
        if (isset($out_imageinfo[1]))
        {
            foreach ($this->imagetags as $imagetag)
            {
                $temp = $this->my_preg_match("'<$imagetag.*?>(.*?)</$imagetag>'si", $out_imageinfo[1]);
                if ($temp != '') $result['image_' . $imagetag] = $temp; // Set only if not empty
            }
        }

        // Parse ITEMS
        preg_match_all("'<item(| .*?)>(.*?)</item>'si", $rss_content, $items);
        $i = 0;
        $result['items'] = array(); // create array even if there are no items

        foreach ($items[2] as $rss_item)
        {
            // Stop once the configured limit is reached (0 = unlimited).
            if ($this->items_limit != 0 && $i >= $this->items_limit)
            {
                break;
            }

            $item = array();
            foreach ($this->itemtags as $itemtag)
            {
                $temp = $this->my_preg_match("'<$itemtag.*?>(.*?)</$itemtag>'si", $rss_item);
                if ($temp != '') $item[$itemtag] = $temp; // Set only if not empty
            }

            if ($this->stripHTML)
            {
                // Strip HTML tags and entities from DESCRIPTION and TITLE
                foreach (array('description', 'title') as $field)
                {
                    if ( ! empty($item[$field]))
                    {
                        $item[$field] = strip_tags($this->unhtmlentities(strip_tags($item[$field])));
                    }
                }
            }

            // If date_format is specified and pubDate is valid, convert it.
            if ($this->date_format != '' && isset($item['pubDate']))
            {
                $timestamp = strtotime($item['pubDate']);
                if ($timestamp !== False)
                {
                    $item['pubDate'] = date($this->date_format, $timestamp);
                }
            }

            // Collect the distinct <img src="..."> URLs of the description.
            if (isset($item['description']))
            {
                preg_match_all('/<img .*?(?=src)src=\"([^\"]+)\"/si', stripslashes($item['description']), $image);
                if (count($image[1]))
                {
                    $item['image'] = array_values(array_unique($image[1]));
                }
            }

            // Collect embedded players from content:encoded.
            // NOTE(review): the pattern published with this snippet was '~|~'
            // (apparently mangled), which matches only empty strings. The
            // expression below is a reconstruction that captures
            // iframe/object/embed markup - confirm the intended pattern.
            if (isset($item['content:encoded']))
            {
                $video_pattern = '~<iframe\b.*?</iframe>|<object\b.*?</object>|<embed\b[^>]*>~si';
                if (preg_match_all($video_pattern, html_entity_decode($item['content:encoded']), $output))
                {
                    $item['video'] = $output[0];
                }
            }

            // As before, an item without any recognised tag creates no entry
            // but is still counted.
            if ( ! empty($item))
            {
                $result['items'][$i] = $item;
            }

            // Item counter
            $i++;
        }

        $result['items_count'] = $i;

        return $result;
    }
}
?>
Advertisements