import httplib2
import
httplib2
import os
os
import re
re
import threading
threading
import urllib
urllib
import urllib.request
urllib.request
from urllib.parse import urlparse, urljoin
from
urllib.parse
urlparse, urljoin
from bs4 import BeautifulSoup
bs4
BeautifulSoup
class CrawlerSingleton(object):
    """Process-wide singleton holding shared crawl state.

    The crawling/downloading functions attach three attributes to the one
    shared instance: ``url_queue``, ``visited_url`` and ``image_downloaded``.
    """

    def __new__(cls):
        # Lazily create the one shared instance the first time the class
        # is constructed; every later call returns that same object.
        shared = getattr(cls, 'instance', None)
        if shared is None:
            shared = super(CrawlerSingleton, cls).__new__(cls)
            cls.instance = shared
        return shared
def navigate_site(max_links=5):
    """Crawl same-host pages starting from the singleton's ``url_queue``.

    Pops URLs off the shared queue, fetches each page, records it in
    ``visited_url``, and queues every same-domain link found on it.  Stops
    when the queue is empty or *max_links* pages have been visited.

    NOTE(review): relies on the module-level ``parsed_url`` (the parsed
    start URL) to restrict the crawl to one host — set under the
    ``__main__`` guard.

    Args:
        max_links: maximum number of pages to visit (default 5).
    """
    crawler = CrawlerSingleton()
    while crawler.url_queue:
        # Bug fix: use >= instead of ==; an externally pre-populated
        # visited_url larger than max_links would otherwise never stop.
        if len(crawler.visited_url) >= max_links:
            return
        url = crawler.url_queue.pop()
        http = httplib2.Http()
        try:
            status, response = http.request(url)
        except Exception:
            # Unreachable or malformed URL: skip it and keep crawling.
            continue
        crawler.visited_url.add(url)
        print(url)

        bs = BeautifulSoup(response, "html.parser")
        # bs.find_all is the supported bs4 spelling (findAll is a legacy alias).
        for link in bs.find_all('a'):
            link_url = link.get('href')
            if not link_url:
                continue
            parsed = urlparse(link_url)
            # Skip links pointing at a different host than the start URL.
            if parsed.netloc and parsed.netloc != parsed_url.netloc:
                continue
            scheme = parsed_url.scheme
            netloc = parsed.netloc or parsed_url.netloc
            path = parsed.path
            link_url = scheme + '://' + netloc + path
            if link_url in crawler.visited_url:
                continue
            # Prepend new links; pop() takes from the end, so the queue
            # is consumed oldest-first.
            crawler.url_queue = [link_url] + crawler.url_queue
class ParallelDownloader(threading.Thread):
    """Worker thread that drains the shared singleton's page set, downloading images."""

    def __init__(self, thread_id, name, counter):
        """
        Args:
            thread_id: numeric identifier for this worker.
            name: human-readable thread name, used in log output.
            counter: caller-supplied counter value, kept for reference.
        """
        super().__init__()
        # Bug fix: the original discarded thread_id and counter entirely;
        # store them so callers can inspect the worker after construction.
        self.thread_id = thread_id
        self.name = name
        self.counter = counter

    def run(self):
        """Thread entry point: download images for crawled pages until none remain."""
        print('Starting thread', self.name)
        download_images(self.name)
        print('Finished thread', self.name)
def download_images(thread_name):
    """Pop crawled pages off the shared singleton and download every image on them.

    Fetches each page in ``visited_url``, parses it, resolves each ``<img src>``
    against the page URL, and saves not-yet-seen images into the ``images/``
    directory.  Safe to run from several threads draining the same set.

    Args:
        thread_name: label printed with each log line so concurrent
            workers can be told apart.
    """
    singleton = CrawlerSingleton()
    while singleton.visited_url:
        url = singleton.visited_url.pop()
        print(thread_name, 'Downloading images from', url)
        # Bug fix: the page must be fetched and parsed here -- the original
        # referenced an undefined ``bs`` and raised NameError at runtime.
        http = httplib2.Http()
        try:
            status, response = http.request(url)
        except Exception:
            continue
        bs = BeautifulSoup(response, "html.parser")
        for image in bs.find_all('img'):
            src = image.get('src')
            # Bug fix: <img> tags without a src attribute yield None,
            # which would crash urljoin/os.path.basename below.
            if not src:
                continue
            # Resolve relative image URLs against the page URL.
            src = urljoin(url, src)
            basename = os.path.basename(src)
            print('basename:', basename)
            if basename != '':
                if src not in singleton.image_downloaded:
                    singleton.image_downloaded.add(src)
                    print('Downloading', src)
                    urllib.request.urlretrieve(src, os.path.join('images', basename))
        print(thread_name, 'finished downloading images from', url)
def main():
    """Crawl the site rooted at ``main_url`` and download its images in parallel.

    Seeds the shared singleton state, crawls up to the default page limit,
    then runs two downloader threads over the visited pages.  Returns only
    after both workers finish.
    """
    crawler = CrawlerSingleton()
    # Seed the shared singleton state before any crawling starts.
    crawler.url_queue = [main_url]
    crawler.visited_url = set()
    crawler.image_downloaded = set()

    navigate_site()

    # Ensure the output directory exists before the downloader threads run.
    if not os.path.exists('images'):
        os.makedirs('images')

    thread1 = ParallelDownloader(1, "Thread-1", 1)
    thread2 = ParallelDownloader(2, "Thread-2", 2)
    thread1.start()
    thread2.start()
    # Improvement: wait for both workers so callers observe completed
    # downloads when main() returns (the original just fired and forgot).
    thread1.join()
    thread2.join()
if __name__ == "__main__":
    # Bug fix: ``main_url`` was referenced here and inside main() but never
    # defined anywhere, so the script crashed with NameError on startup.
    # Define the crawl root before parsing it; change this URL to crawl
    # a different site.
    main_url = 'https://www.geeksforgeeks.org/'
    # parsed_url is read as a module global by navigate_site() to keep the
    # crawl on a single host.
    parsed_url = urlparse(main_url)
    main()