#!/usr/bin/env python
# -*- coding: utf-8 -*-

__thisfile__ = "http://www.jaist.ac.jp/~s1010205/email_extractor/email_extractor.py"

"""
Web Data Extractor: extracts e-mail addresses by crawling a site.

Copyright (C) 2011 KATHURIA Pulkit
Contact: pulkit@jaist.ac.jp

Contributors:
    Open Source Sitemap Generator sitemap_gen by Vladimir Toncar
    http://toncar.cz/opensource/sitemap_gen.html

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.
"""

import re
import argparse
import urllib2
import urlparse
import robotparser
from urllib import urlopen
from collections import defaultdict
from HTMLParser import HTMLParser
from HTMLParser import HTMLParseError


def getPage(url):
    """Fetch a URL; return (page, last-modified date, final URL)."""
    try:
        f = urllib2.urlopen(url)
        page = ""
        for line in f.readlines():
            page += line
        date = f.info().getdate('Last-Modified')
        if date is None:
            date = (0, 0, 0)
        else:
            date = date[:3]
        f.close()
        return (page, date, f.url)
    except urllib2.URLError:
        pass
    return (None, (0, 0, 0), "")


def joinUrls(baseUrl, newUrl):
    """Resolve newUrl against baseUrl, dropping any fragment."""
    helpUrl, fragment = urlparse.urldefrag(newUrl)
    return urlparse.urljoin(baseUrl, helpUrl)


def getRobotParser(startUrl):
    """Fetch and parse the site's robots.txt, if one is available."""
    rp = robotparser.RobotFileParser()
    robotUrl = urlparse.urljoin(startUrl, "/robots.txt")
    page, date, url = getPage(robotUrl)
    if page is None:
        return None
    rp.parse(page.splitlines())
    return rp


class MyHTMLParser(HTMLParser):
    """Collects same-server links from <a> tags into pageMap."""

    def __init__(self, pageMap, redirects, baseUrl, maxUrls,
                 blockExtensions, robotParser):
        HTMLParser.__init__(self)
        self.pageMap = pageMap
        self.redirects = redirects
        self.baseUrl = baseUrl
        self.server = urlparse.urlsplit(baseUrl)[1]  # netloc in Python 2.5
        self.maxUrls = maxUrls
        self.blockExtensions = blockExtensions
        self.robotParser = robotParser

    def hasBlockedExtension(self, url):
        p = urlparse.urlparse(url)
        path = p[2].upper()  # path attribute
        for i in self.blockExtensions:
            if path.endswith(i):
                return 1
        return 0

    def handle_starttag(self, tag, attrs):
        if len(self.pageMap) >= self.maxUrls:
            return
        if tag.upper() == "BASE":
            if attrs and attrs[0][0].upper() == "HREF":
                self.baseUrl = joinUrls(self.baseUrl, attrs[0][1])
        if tag.upper() == "A":
            url = ""
            for attr in attrs:
                if (attr[0].upper() == "REL") and (attr[1].upper().find('NOFOLLOW') != -1):
                    # Respect rel="nofollow" links.
                    return
                elif (attr[0].upper() == "HREF") and (attr[1].upper().find('MAILTO:') == -1):
                    url = joinUrls(self.baseUrl, attr[1])
            if url == "":
                return
            if urlparse.urlsplit(url)[1] != self.server:
                # Stay on the same server as the start URL.
                return
            if self.hasBlockedExtension(url) or self.redirects.count(url) > 0:
                return
            if (self.robotParser is not None) and not self.robotParser.can_fetch("*", url):
                return
            if url not in self.pageMap:
                self.pageMap[url] = ()


def getUrlToProcess(pageMap):
    """Return a URL that has not been crawled yet, or None if all are done."""
    for i in pageMap.keys():
        if pageMap[i] == ():
            return i
    return None


def parsePages(startUrl, maxUrls, blockExtensions):
    """Crawl from startUrl until at most maxUrls pages have been mapped."""
    pageMap = {}
    pageMap[startUrl] = ()
    redirects = []
    robotParser = getRobotParser(startUrl)

    while True:
        url = getUrlToProcess(pageMap)
        if url is None:
            break
        print " ", url
        page, date, newUrl = getPage(url)
        if page is None:
            del pageMap[url]
        elif url != newUrl:
            # The request was redirected; crawl the target URL instead.
            print newUrl
            del pageMap[url]
            pageMap[newUrl] = ()
            redirects.append(url)
        else:
            pageMap[url] = date
            parser = MyHTMLParser(pageMap, redirects, url, maxUrls,
                                  blockExtensions, robotParser)
            try:
                parser.feed(page)
                parser.close()
            except HTMLParseError:
                pass
            except UnicodeDecodeError:
                pass

    return pageMap


def grab_email(text):
    """Return the unique e-mail addresses found in an iterable of lines."""
    found = []
    mailsrch = re.compile(r'[\w\-][\w\-\.]+@[\w\-][\w\-\.]+[a-zA-Z]{1,4}')
    for line in text:
        found.extend(mailsrch.findall(line))
    u = {}
    for item in found:
        u[item] = 1
    return u.keys()


def urltext(url):
    """Return the page source of url as a list of lines."""
    viewsource = urlopen(url).readlines()
    return viewsource


def crawl_site(url, limit):
    # No extensions are blocked by default; pass a list such as
    # ['.PDF', '.ZIP'] to skip binary downloads.
    return parsePages(url, limit, [])


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Web Email Extractor')
    parser.add_argument('-l', '--limit', action="store", default=100,
                        dest="limit", type=int, help='-l numUrlsToCrawl')
    parser.add_argument('-u', '--url', action="store", dest="url",
                        help='-u http://sitename.com')
    myarguments = parser.parse_args()

    emails = defaultdict(int)
    for url in crawl_site(myarguments.url, myarguments.limit):
        for email in grab_email(urltext(url)):
            if email not in emails:
                print email
            emails[email] += 1
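
# ----------------------------------------------------------------------------
# Usage sketch (illustrative only, not executed): assuming the script is saved
# as email_extractor.py and run under Python 2, a typical invocation would be
#
#     python email_extractor.py -u http://sitename.com -l 50
#
# which crawls up to 50 pages of the given site (http://sitename.com is the
# placeholder from the --url help text) and prints each e-mail address the
# first time it is encountered.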