Guide to the Python singleton class

The example below shares crawler state (a URL queue, the set of visited pages, and the set of downloaded images) through a single CrawlerSingleton instance, first crawling a site and then downloading its images on two parallel threads.

import httplib2
import os
import threading
import urllib.request
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup

class CrawlerSingleton(object):
    def __new__(cls):
        # Create the instance only on the first call; afterwards
        # every construction returns the same stored object
        if not hasattr(cls, 'instance'):
            cls.instance = super(CrawlerSingleton, cls).__new__(cls)
        return cls.instance
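
Because __new__ stores the instance on the class and hands it back on every later call, each CrawlerSingleton() call returns the same object. A quick check of the pattern (a standalone snippet, not part of the crawler):

a = CrawlerSingleton()
b = CrawlerSingleton()
print(a is b)  # True: both calls return the one shared instance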

def navigate_site(max_links=5):
    parser_crawlersingleton = CrawlerSingleton()
    while parser_crawlersingleton.url_queue:
        if len(parser_crawlersingleton.visited_url) == max_links:
            return
        url = parser_crawlersingleton.url_queue.pop()
        http = httplib2.Http()
        try:
            # httplib2 returns a (response, content) pair
            response, content = http.request(url)
        except Exception:
            continue
        parser_crawlersingleton.visited_url.add(url)
        print(url)
        bs = BeautifulSoup(content, "html.parser")
        for link in bs.find_all('a'):
            link_url = link.get('href')
            if not link_url:
                continue
            parsed = urlparse(link_url)
            # Skip links that point to a different site
            if parsed.netloc and parsed.netloc != parsed_url.netloc:
                continue
            # Rebuild an absolute URL from the start page's scheme and host
            scheme = parsed_url.scheme
            netloc = parsed.netloc or parsed_url.netloc
            path = parsed.path
            link_url = scheme + '://' + netloc + path
            if link_url in parser_crawlersingleton.visited_url:
                continue
            parser_crawlersingleton.url_queue = [link_url] + \
                                                parser_crawlersingleton.url_queue
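
For reference, this is how urllib.parse splits a link, and how urljoin (used further below) resolves a relative one; the example.com URLs are placeholders:

parsed = urlparse('https://example.com/gallery/page1.html')
print(parsed.scheme, parsed.netloc, parsed.path)
# https example.com /gallery/page1.html
print(urljoin('https://example.com/gallery/page1.html', 'img/cat.png'))
# https://example.com/gallery/img/cat.png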

class ParallelDownloader(threading.Thread):
    def __init__(self, thread_id, name, counter):
        threading.Thread.__init__(self)
        self.thread_id = thread_id
        self.name = name
        self.counter = counter

    def run(self):
        print('Starting thread', self.name)
        download_images(self.name)
        print('Finished thread', self.name)

def download_images(thread_name):
    singleton = CrawlerSingleton()
    while singleton.visited_url:
        try:
            # pop() may race against the other thread; treat an empty set as done
            url = singleton.visited_url.pop()
        except KeyError:
            break
        http = httplib2.Http()
        print(thread_name, 'Downloading images from', url)
        try:
            response, content = http.request(url)
        except Exception:
            continue
        bs = BeautifulSoup(content, "html.parser")
        for image in bs.find_all('img'):
            src = image.get('src')
            if not src:
                continue
            # Resolve relative image paths against the page they appeared on
            src = urljoin(url, src)
            basename = os.path.basename(src)
            print('basename:', basename)
            if basename != '' and src not in singleton.image_downloaded:
                singleton.image_downloaded.add(src)
                print('Downloading', src)
                urllib.request.urlretrieve(src, os.path.join('images', basename))
        print(thread_name, 'finished downloading images from', url)

def main():
    crwSingltn = CrawlerSingleton()
    crwSingltn.url_queue = [main_url]
    crwSingltn.visited_url = set()
    crwSingltn.image_downloaded = set()
    navigate_site()
    if not os.path.exists('images'):
        os.makedirs('images')
    thread1 = ParallelDownloader(1, "Thread-1", 1)
    thread2 = ParallelDownloader(2, "Thread-2", 2)
    thread1.start()
    thread2.start()
    # Wait for both workers to finish
    thread1.join()
    thread2.join()

if __name__ == "__main__":
    # Placeholder seed URL; the tutorial does not show the original start page
    main_url = 'https://example.com'
    parsed_url = urlparse(main_url)
    main()
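
One caveat: the hasattr check in __new__ is not atomic, so two threads constructing the class at exactly the same time could each create an instance. In this program the worker threads only call CrawlerSingleton() after main() has already created it, but a lock-guarded variant is safer in general. A minimal sketch, assuming the same __new__-based approach:

import threading

class ThreadSafeSingleton:
    _lock = threading.Lock()

    def __new__(cls):
        # Only one thread at a time may run the create-or-reuse check
        with cls._lock:
            if not hasattr(cls, 'instance'):
                cls.instance = super().__new__(cls)
        return cls.instance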
