#!/usr/bin/env python
# -*- coding: utf-8 -*-
__thisfile__ = "http://www.jaist.ac.jp/~s1010205/email_extractor/email_extractor.py"
"""
    Web Data Extractor: extracts emails by crawling a site
    Copyright (C) 2011 KATHURIA Pulkit
    Contact: pulkit@jaist.ac.jp

    Contributors:
        Open Source Sitemap Generator sitemap_gen by Vladimir Toncar
        http://toncar.cz/opensource/sitemap_gen.html

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
"""
import re
import argparse
import urllib2
import urlparse
import robotparser
from urllib import urlopen
from collections import defaultdict
from HTMLParser import HTMLParser, HTMLParseError

def getPage(url):
    # Fetch a URL; return (page, last-modified date, final URL), or
    # (None, (0, 0, 0), "") if the request fails.
    try:
        f = urllib2.urlopen(url)
        page = f.read()
        date = f.info().getdate('Last-Modified')
        if date is None:
            date = (0, 0, 0)
        else:
            date = date[:3]  # keep (year, month, day) only
        f.close()
        return (page, date, f.url)
    except urllib2.URLError:
        return (None, (0, 0, 0), "")

def joinUrls(baseUrl, newUrl):
    # Drop any #fragment, then resolve newUrl relative to baseUrl.
    helpUrl, fragment = urlparse.urldefrag(newUrl)
    return urlparse.urljoin(baseUrl, helpUrl)

def getRobotParser(startUrl):
    # Download and parse robots.txt for the start URL's host; None if unavailable.
    rp = robotparser.RobotFileParser()
    robotUrl = urlparse.urljoin(startUrl, "/robots.txt")
    page, date, url = getPage(robotUrl)
    if page is None:
        return None
    rp.parse(page.splitlines())  # parse() expects a list of lines, not one string
    return rp

class MyHTMLParser(HTMLParser):
    def __init__(self, pageMap, redirects, baseUrl, maxUrls, blockExtensions, robotParser):
        HTMLParser.__init__(self)
        self.pageMap = pageMap
        self.redirects = redirects
        self.baseUrl = baseUrl
        self.server = urlparse.urlsplit(baseUrl)[1]  # netloc in python 2.5
        self.maxUrls = maxUrls
        self.blockExtensions = blockExtensions
        self.robotParser = robotParser

    def hasBlockedExtension(self, url):
        p = urlparse.urlparse(url)
        path = p[2].upper()  # path attribute
        for i in self.blockExtensions:
            if path.endswith(i):
                return True
        return False

    def handle_starttag(self, tag, attrs):
        if len(self.pageMap) >= self.maxUrls:
            return
        if tag.upper() == "BASE":
            if attrs and attrs[0][0].upper() == "HREF":
                self.baseUrl = joinUrls(self.baseUrl, attrs[0][1])
        if tag.upper() == "A":
            url = ""
            for attr in attrs:
                if attr[1] is None:  # valueless attribute, e.g. <a download>
                    continue
                if attr[0].upper() == "REL" and attr[1].upper().find('NOFOLLOW') != -1:
                    return
                elif attr[0].upper() == "HREF" and attr[1].upper().find('MAILTO:') == -1:
                    url = joinUrls(self.baseUrl, attr[1])
            if url == "":
                return
            # Stay on the same host as the start URL.
            if urlparse.urlsplit(url)[1] != self.server:
                return
            if self.hasBlockedExtension(url) or url in self.redirects:
                return
            if self.robotParser is not None and not self.robotParser.can_fetch("*", url):
                return
            if url not in self.pageMap:
                self.pageMap[url] = ()  # () marks "not yet crawled"

def getUrlToProcess(pageMap):
    # Return any URL not yet crawled (marked by an empty tuple), or None when done.
    for i in pageMap.keys():
        if pageMap[i] == ():
            return i
    return None

def parsePages(startUrl, maxUrls, blockExtensions):
    pageMap = {}
    pageMap[startUrl] = ()
    redirects = []
    robotParser = getRobotParser(startUrl)
    while True:
        url = getUrlToProcess(pageMap)
        if url is None:
            break
        print " ", url
        page, date, newUrl = getPage(url)
        if page is None:
            del pageMap[url]
        elif url != newUrl:
            # The request was redirected; queue the target and remember the source.
            print newUrl
            del pageMap[url]
            pageMap[newUrl] = ()
            redirects.append(url)
        else:
            pageMap[url] = date
            parser = MyHTMLParser(pageMap, redirects, url, maxUrls, blockExtensions, robotParser)
            try:
                parser.feed(page)
                parser.close()
            except HTMLParseError:
                pass
            except UnicodeDecodeError:
                pass
    return pageMap

def grab_email(text):
    # Collect email-like strings from an iterable of lines.
    found = []
    mailsrch = re.compile(r'[\w\-][\w\-\.]+@[\w\-][\w\-\.]+[a-zA-Z]{1,4}')
    for line in text:
        found.extend(mailsrch.findall(line))
    u = {}
    for item in found:
        u[item] = 1  # dict keys give de-duplication
    return u.keys()

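# Illustrative check, not part of the original script (Python 2):
#
#     >>> grab_email(["write to pulkit@jaist.ac.jp or admin@example.org"])
#     ['pulkit@jaist.ac.jp', 'admin@example.org']
#
# The order of results may differ, since uniqueness comes from dict keys.
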
def urltext(url):
    # Return the raw page source as a list of lines.
    return urlopen(url).readlines()

def crawl_site(url, limit):
    return parsePages(url, limit, [])  # empty list: no blocked file extensions

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Web Email Extractor')
    parser.add_argument('-l', '--limit', action="store", default=100, dest="limit",
                        type=int, help='-l numUrlsToCrawl')
    parser.add_argument('-u', '--url', action="store", dest="url", required=True,
                        help='-u http://sitename.com')
    myarguments = parser.parse_args()
    emails = defaultdict(int)
    for url in crawl_site(myarguments.url, myarguments.limit):
        for email in grab_email(urltext(url)):
            if email not in emails:  # print each address only the first time
                print email
            emails[email] += 1
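
# Example invocation (a sketch; assumes this file is saved as email_extractor.py):
#
#     python email_extractor.py -u http://example.com -l 50
#
# This crawls up to 50 pages on the start URL's host (respecting robots.txt and
# rel="nofollow") and prints each distinct address the first time it is seen.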
