A Simple Python Crawler – Scraping Posts from Qiushibaike (糗事百科)


This is a simple crawler from an earlier blog of mine. I forgot to export that blog, so I dug the code back out of a search-engine cache; after all, it was my first attempt at writing a crawler in Python.

import urllib.request
import urllib.parse
import urllib.error
import time
import os
import threading
import queue
import bs4
from bs4 import BeautifulSoup
import shutil
import errno
import sys

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
           'Accept': 'text/html;q=0.9,*/*;q=0.8',
           'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
           'Accept-Encoding': 'gzip',
           'Connection': 'close',
           'Referer': None  # note: if fetching still fails, set this to the host of the target site
           }
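
# Note: the `headers` dict above is defined but never attached to a request in this script;
# the openers below only set a User-Agent via addheaders. If you wanted to use it, one way
# (a sketch, not part of the original flow) would be:
#     req = urllib.request.Request('http://www.qiushibaike.com', headers=headers)
#     html = urllib.request.urlopen(req).read()
# Keep in mind that with 'Accept-Encoding': 'gzip' you would have to decompress the
# response yourself, since urllib does not do that transparently.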

#############################################################
# worker thread
class Worker(threading.Thread):
    worker_count = 0

    def __init__(self, workQueue, resultQueue, timeout=0, **kwds):
        threading.Thread.__init__(self, **kwds)
        self.id = Worker.worker_count
        Worker.worker_count += 1
        self.daemon = True  # setDaemon() is deprecated; set the attribute instead
        self.workQueue = workQueue
        self.resultQueue = resultQueue
        self.timeout = timeout
        self.start()

    def run(self):
        ''' the get-some-work, do-some-work main loop of worker threads '''
        while True:
            try:
                callable, args, kwds = self.workQueue.get()
                res = callable(*args, **kwds)
                #print("worker[%2d]: %s" % (self.id, str(res)))
                #self.resultQueue.put(res)
            except queue.Empty:
                pass
                #break
            except:
                print('worker[%2d]' % self.id, sys.exc_info()[:2])


class WorkerManager:
    def __init__(self, num_of_workers=10, timeout=1):
        self.workQueue = queue.Queue()
        self.resultQueue = queue.Queue()
        self.workers = []
        self.timeout = timeout
        self._recruitThreads(num_of_workers)

    def _recruitThreads(self, num_of_workers):
        for i in range(num_of_workers):
            worker = Worker(self.workQueue, self.resultQueue, self.timeout)
            self.workers.append(worker)

    def wait_for_complete(self):
        # ...then, wait for each of them to terminate:
        while len(self.workers):
            worker = self.workers.pop()
            worker.join()
            if worker.is_alive() and not self.workQueue.empty():  # isAlive() was removed in Python 3.9
                self.workers.append(worker)
        print("All jobs are completed.")

    def add_job(self, callable, *args, **kwds):
        self.workQueue.put((callable, args, kwds))

    def get_result(self, *args, **kwds):
        return self.resultQueue.get(*args, **kwds)

#############################################################


class Fetcher:
    def __init__(self, manager):
        self.opener = urllib.request.build_opener(urllib.request.HTTPHandler)
        self.opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11')]
        self.lock = threading.Lock()    # thread lock
        self.q_req = queue.Queue()      # request queue
        self.q_ans = queue.Queue()      # finished (answer) queue
        self.__q_retry = queue.Queue()  # retry queue
        #self.threads_num = threads_num + 1  # the extra 1 is the retry thread
        #self.__threads = []

        # retry thread
        manager.add_job(self.threadretry, self)
        #self.__retry_thread = threading.Thread(target=self.threadretry)
        #self.__threads.append(self.__retry_thread)
        #self.__retry_thread.setDaemon(True)
        #self.__retry_thread.start()

        # start the worker thread(s)
        manager.add_job(self.threadget, self)
        #for i in range(threads_num):
        #    t = threading.Thread(target=self.threadget)
        #    self.__threads.append(t)
        #    t.setDaemon(True)
        #    t.start()

        self.running = 0

    #def __del__(self):  # on teardown, wait for the queues to drain
        #print("will delete")
        #self.q_req.join()
        #self.q_ans.join()
        #self.__q_retry.join()
        #print("deleted")
        #for i in range(self.threads_num+1):
        #    self.__threads[i].exit()
        #print("there are still " + str(threading.active_count()) + " active threads")

    def taskleft(self):
        return self.q_req.qsize() + self.q_ans.qsize() + self.running

    def push(self, req):
        self.q_req.put(req)

    def pop(self):
        return self.q_ans.get()

    def threadretry(self, _self):
        while True:
            try:
                req = _self.__q_retry.get(timeout=1)  # get(self, block=True, timeout=None)
            except queue.Empty:
                break

            #with _self.lock:
            _self.running += 1

            try:
                ans = _self.opener.open(req).read()
            except urllib.error.URLError as e:
                ans = ''
                if hasattr(e, 'reason'):
                    print('We failed to reach a server.')
                    print('Reason: ', e.reason)
                elif hasattr(e, 'code'):
                    print('The server cannot fulfill the request.')
                    print('Reason: ', e.code)
            else:
                if ans:
                    _self.q_ans.put((req, ans))
            finally:
                #with _self.lock:
                _self.running -= 1

                _self.__q_retry.task_done()

    def threadget(self, _self):
        while True:
            try:
                req = _self.q_req.get(timeout=1)
            except queue.Empty:
                break
            #with _self.lock:  # this increment should really be atomic (critical section)
            _self.running += 1

            try:
                ans = _self.opener.open(req).read()
            except urllib.error.URLError as e:
                ans = ''
                if hasattr(e, 'reason'):
                    print('We failed to reach a server.')
                    print('Reason: ', e.reason)
                    _self.__q_retry.put(req)
                elif hasattr(e, 'code'):
                    print('The server cannot fulfill the request.')
                    print('Reason: ', e.code)
                    _self.__q_retry.put(req)
            else:
                if ans:
                    _self.q_ans.put((req, ans))
                else:
                    _self.__q_retry.put(req)
            finally:
                #with _self.lock:
                _self.running -= 1
                _self.q_req.task_done()

def create_dir(userid, domain='qiushibaike'):
    dir_name = domain + '/' + userid
    try:
        os.mkdir(dir_name)
    except OSError as e:
        if e.errno == errno.EEXIST and os.path.isdir(dir_name):
            pass
        else:
            print(str(e))


def userid_exist(userid):
    return os.path.isdir('qiushibaike' + '/' + userid)


def get_file_name(userid):
    current_time = time.strftime("%Y-%m-%d", time.localtime())
    return 'qiushibaike' + '/' + userid + '/' + current_time + '.txt'


def write_file(file, soup):
    count = 0
    for ii in soup.find_all("div", class_="content clearfix"):
        #print(ii.a["href"])
        #print(ii.a.text)
        if ii.a.text:
            count += 1
            file.write(bytes(ii.a["href"], encoding="utf-8"))
            file.write(bytes('\r\n', encoding="utf-8"))
            file.write(bytes(ii.a.text, encoding="utf-8"))
            file.write(bytes("\r\n\r\n", encoding="utf-8"))
    return count


def get_max_page(soup):
    #ii = bs4.element.Tag()
    num = 0
    for jj in soup.find_all('a', rel="next", class_=None):
        num = int(jj.text)
    return num

def store_this_user(userid, manager):
    if userid_exist(userid):
        print("This user seems to have been crawled already")
        return
    create_dir(userid)

    file_name = get_file_name(userid)
    file = open(file_name, 'wb')

    ff = Fetcher(manager)
    ff.push('http://www.qiushibaike.com/users/' + userid)
    req, ans = ff.pop()

    soup = BeautifulSoup(ans.decode('utf-8'), "html.parser")
    user_name = ""
    for ii in soup.find_all('span', class_="user_center"):
        user_name = ii.text
    if not user_name:
        del ff
        file.close()
        return

    # write the first page to the file
    count = write_file(file, soup)
    print(user_name + " " + str(count) + " posts [http://www.qiushibaike.com/users/" + userid + "/articles/page/1]")

    # queue up the remaining pages
    max_page = get_max_page(soup) + 1
    for i in range(2, max_page):
        #print("queued [http://www.qiushibaike.com/users/" + userid + "/articles/page/" + str(i) + "]")
        ff.push("http://www.qiushibaike.com/users/" + userid + "/articles/page/" + str(i))

    while ff.taskleft():
        req, ans = ff.pop()
        soup = BeautifulSoup(ans.decode('utf-8'), "html.parser")
        count = write_file(file, soup)
        print(user_name + " " + str(count) + " posts [" + req + "]")

    print(user_name + "'s fetcher resources released")
    del ff
    file.close()
    return

def main():
    #os.mkdir('qiushibaike')
    #store_this_user("13843355")

    opener = urllib.request.build_opener(urllib.request.HTTPHandler)
    opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11')]

    worker_manager = WorkerManager(128)
    #manager = WorkerManager(128)

    #ff = Fetcher(worker_manager)
    next_link = 'http://www.qiushibaike.com'
    #ff.push(next_link)

    page_num = 0

    while True:
        page_num += 1
        print("Turning to page " + str(page_num) + " " + next_link)
        ans = opener.open(next_link).read()
        next_link = ""
        #req, ans = ff.pop()
        soup = BeautifulSoup(ans.decode('utf-8'), "html.parser")

        # "下一页" is the literal "next page" link text on the site
        for ii in soup.find_all('a', class_="next", text="下一页"):
            next_link = ii["href"]
        if not next_link:
            print("No next-page link found, stopping.")
            break
        next_link = "http://www.qiushibaike.com" + next_link
        #ff.push(next_link)

        for ii in soup.find_all('div', class_="author"):
            print(ii.a["href"].split('/')[2])
            store_this_user(ii.a["href"].split('/')[2], worker_manager)

# ---- leftover scratch code from earlier experiments, kept disabled below ----

'''
       file.close()

       ff = Fetcher(10)
       ff.push('http://www.qiushibaike.com/users/14870461')
       req, ans = ff.pop()

       print(ans.decode('utf8'))

       #os.system("pause")

       testgbk = '汉字'
       testunit = testgbk.encode('gbk')     # encode the string as GBK bytes
       print(testunit)

       testutf8 = testgbk.encode('utf-8')   # encode as UTF-8
       print(testutf8)

       testunit = testutf8.decode('utf-8')  # decode the UTF-8 bytes back to str
       print(testunit)

       testgbk = testunit.encode('gbk')     # re-encode as GBK
       print(testgbk)
'''


'''
       links = ['http://item.jd.com/%d.html' % i for i in range(1746854, 1746860)]
       ff = Fetcher(10)
       for url in links:
           ff.push(url)

       while ff.taskleft():
           (url, content) = ff.pop()
           print(url, len(content))
'''


'''
url = 'http://www.sina.com'
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
values = {'name': 'Michael Foord',
          'location': 'pythontab',
          'language': 'Python'}
headers = {'User-Agent': user_agent}

data = urllib.parse.urlencode(values)
#req = urllib.request.Request(url, data, headers)
req = urllib.request.Request('http://www.baidu.com')
req.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')
'''


'''
##################################
def cbk(a, b, c):
    # progress callback for urlretrieve
    # @a: number of blocks downloaded so far
    # @b: block size
    # @c: total size of the remote file
    per = 100.0 * a * b / c
    if per > 100:
        per = 100
    num = int(per)

    print('[', end='')
    for i in range(num):
        print('#', end='')
    print('%.2f]' % (per), end='')

url = 'http://www.sina.com.cn'
local = 'e:/sina.html'
urllib.request.urlretrieve(url, local, cbk)

input()
os.system("pause")
##################################
'''

'''
try:
    response = urllib.request.urlopen(req)
except urllib.error.URLError as e:
    if hasattr(e, 'reason'):
        print('We failed to reach a server.')
        print('Reason: ', e.reason)
    elif hasattr(e, 'code'):
        print('The server cannot fulfill the request.')
        print('Reason: ', e.code)
else:
    #print(response.info)
    #print(response.getcode())
    response_context = response.read()
    print(response_context.decode("utf8"))
'''

if __name__ == "__main__":
    main()
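
A quick usage note: the WorkerManager above is just a generic thread pool, so it can be reused on its own. Below is a minimal sketch of how jobs could be submitted to it; the fetch_length() helper and the URL list are illustrative assumptions, not part of the original script.

import queue
import urllib.request

# Minimal sketch reusing the WorkerManager thread pool defined above.
# The URL list and fetch_length() helper are made up for illustration.
def fetch_length(url, results):
    try:
        data = urllib.request.urlopen(url, timeout=10).read()
        results.put((url, len(data)))
    except Exception as e:
        results.put((url, e))

results = queue.Queue()
manager = WorkerManager(4)                  # 4 worker threads
urls = ['http://www.qiushibaike.com', 'http://www.qiushibaike.com/hot']
for url in urls:
    manager.add_job(fetch_length, url, results)

for _ in urls:                              # collect one result per submitted job
    print(results.get())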