This is a simple crawler from my old blog. I forgot to export the blog, so I dug the code back out of a search-engine cache — after all, it was my first attempt at a Python crawler.
```python
import urllib.request
import urllib.parse
import urllib.error
import time
import os
import threading
import queue
import bs4
from bs4 import BeautifulSoup
import shutil
import errno
import sys

# Default request headers (defined here but never applied to the opener below).
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
           'Accept': 'text/html;q=0.9,*/*;q=0.8',
           'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
           'Accept-Encoding': 'gzip',
           'Connection': 'close',
           'Referer': None  # note: if fetching still fails, set this to the target site's host
           }

#############################################################
# working thread
class Worker(threading.Thread):
    worker_count = 0

    def __init__(self, workQueue, resultQueue, timeout=0, **kwds):
        threading.Thread.__init__(self, **kwds)
        self.id = Worker.worker_count
        Worker.worker_count += 1
        self.daemon = True
        self.workQueue = workQueue
        self.resultQueue = resultQueue
        self.timeout = timeout
        self.start()

    def run(self):
        '''the get-some-work, do-some-work main loop of worker threads'''
        while True:
            try:
                callable, args, kwds = self.workQueue.get()
                res = callable(*args, **kwds)
                #print("worker[%2d]: %s" % (self.id, str(res)))
                #self.resultQueue.put(res)
            except queue.Empty:
                pass
                #break
            except:
                print('worker[%2d]' % self.id, sys.exc_info()[:2])


class WorkerManager:
    def __init__(self, num_of_workers=10, timeout=1):
        self.workQueue = queue.Queue()
        self.resultQueue = queue.Queue()
        self.workers = []
        self.timeout = timeout
        self._recruitThreads(num_of_workers)

    def _recruitThreads(self, num_of_workers):
        for i in range(num_of_workers):
            worker = Worker(self.workQueue, self.resultQueue, self.timeout)
            self.workers.append(worker)

    def wait_for_complete(self):
        # ...then, wait for each of them to terminate:
        while len(self.workers):
            worker = self.workers.pop()
            worker.join()
            if worker.is_alive() and not self.workQueue.empty():
                self.workers.append(worker)
        print("All jobs are completed.")

    def add_job(self, callable, *args, **kwds):
        self.workQueue.put((callable, args, kwds))

    def get_result(self, *args, **kwds):
        return self.resultQueue.get(*args, **kwds)


#############################################################


class Fetcher:
    def __init__(self, manager):
        self.opener = urllib.request.build_opener(urllib.request.HTTPHandler)
        self.opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11')]
        self.lock = threading.Lock()    # thread lock
        self.q_req = queue.Queue()      # request queue
        self.q_ans = queue.Queue()      # finished queue
        self.__q_retry = queue.Queue()  # retry queue
        #self.threads_num = threads_num + 1  # the +1 is the retry thread
        #self.__threads = []

        # retry thread
        manager.add_job(self.threadretry, self)
        #self.__retry_thread = threading.Thread(target=self.threadretry)
        #self.__threads.append(self.__retry_thread)
        #self.__retry_thread.setDaemon(True)
        #self.__retry_thread.start()

        # start @threads_num worker threads
        manager.add_job(self.threadget, self)
        #for i in range(threads_num):
        #    t = threading.Thread(target=self.threadget)
        #    self.__threads.append(t)
        #    t.setDaemon(True)
        #    t.start()

        self.running = 0

    #def __del__(self):  # on teardown, wait for the queues to drain
        #print("will delete")
        #self.q_req.join()
        #self.q_ans.join()
        #self.__q_retry.join()
        #print("deleted")
        #for i in range(self.threads_num+1):
        #    self.__threads[i].exit()
        #print("there are still " + str(threading.active_count()) + " active threads")

    def taskleft(self):
        return self.q_req.qsize() + self.q_ans.qsize() + self.running

    def push(self, req):
        self.q_req.put(req)

    def pop(self):
        return self.q_ans.get()

    def threadretry(self, _self):
        while True:
            try:
                req = _self.__q_retry.get(timeout=1)  # get(self, block=True, timeout=None)
            except queue.Empty:
                break

            #with _self.lock:
            _self.running += 1

            try:
                ans = _self.opener.open(req).read()
            except urllib.error.URLError as e:
                ans = ''
                if hasattr(e, 'reason'):
                    print('We failed to reach a server.')
                    print('Reason: ', e.reason)
                elif hasattr(e, 'code'):
                    print('The server cannot fulfill the request.')
                    print('Reason: ', e.code)
            else:
                if ans:
                    _self.q_ans.put((req, ans))
            finally:
                #with _self.lock:
                _self.running -= 1
                _self.__q_retry.task_done()

    def threadget(self, _self):
        while True:
            try:
                req = _self.q_req.get(timeout=1)
            except queue.Empty:
                break
            #with _self.lock:  # this update should really be atomic (critical section)
            _self.running += 1

            try:
                ans = _self.opener.open(req).read()
            except urllib.error.URLError as e:
                ans = ''
                if hasattr(e, 'reason'):
                    print('We failed to reach a server.')
                    print('Reason: ', e.reason)
                    _self.__q_retry.put(req)
                elif hasattr(e, 'code'):
                    print('The server cannot fulfill the request.')
                    print('Reason: ', e.code)
                    _self.__q_retry.put(req)
            else:
                if ans:
                    _self.q_ans.put((req, ans))
                else:
                    _self.__q_retry.put(req)
            finally:
                #with _self.lock:
                _self.running -= 1
                _self.q_req.task_done()


def create_dir(userid, domain='qiushibaike'):
    dir_name = domain + '/' + userid
    try:
        os.mkdir(dir_name)
    except OSError as e:
        if e.errno == errno.EEXIST and os.path.isdir(dir_name):
            pass
        else:
            print(str(e))

def userid_exist(userid):
    return os.path.isdir('qiushibaike' + '/' + userid)

def get_file_name(userid):
    current_time = time.strftime("%Y-%m-%d", time.localtime())
    return 'qiushibaike' + '/' + userid + '/' + current_time + '.txt'


def write_file(file, soup):
    count = 0
    for ii in soup.find_all("div", class_="content clearfix"):
        #print(ii.a["href"])
        #print(ii.a.text)
        if ii.a.text:
            count += 1
            file.write(bytes(ii.a["href"], encoding="utf-8"))
            file.write(bytes('\r\n', encoding="utf-8"))
            file.write(bytes(ii.a.text, encoding="utf-8"))
            file.write(bytes("\r\n\r\n", encoding="utf-8"))
    return count

def get_max_page(soup):
    #ii = bs4.element.Tag()
    num = 0
    for jj in soup.find_all('a', rel="next", class_=None):
        num = int(jj.text)
    return num


def store_this_user(userid, manager):
    if userid_exist(userid):
        print("This user seems to have been crawled already")
        return
    create_dir(userid)

    file_name = get_file_name(userid)
    file = open(file_name, 'wb')

    ff = Fetcher(manager)
    ff.push('http://www.qiushibaike.com/users/' + userid)
    req, ans = ff.pop()

    soup = BeautifulSoup(ans.decode('utf-8'), "html.parser")
    user_name = ""
    for ii in soup.find_all('span', class_="user_center"):
        user_name = ii.text
    if not user_name:
        del ff
        file.close()
        return

    # write the first page to the file
    count = write_file(file, soup)
    print(user_name + " " + str(count) + " posts [http://www.qiushibaike.com/users/" + userid + "/articles/page/1]")

    # request the remaining pages
    max_page = get_max_page(soup) + 1
    for i in range(2, max_page):
        #print("queued [http://www.qiushibaike.com/users/" + userid + "/articles/page/" + str(i) + "]")
        ff.push("http://www.qiushibaike.com/users/" + userid + "/articles/page/" + str(i))

    while ff.taskleft():
        req, ans = ff.pop()
        soup = BeautifulSoup(ans.decode('utf-8'), "html.parser")
        count = write_file(file, soup)
        print(user_name + " " + str(count) + " posts [" + req + "]")

    print(user_name + "'s fetcher resources released")
    del ff
    file.close()
    return


def main():
    #os.mkdir('qiushibaike')
    #store_this_user("13843355")

    opener = urllib.request.build_opener(urllib.request.HTTPHandler)
    opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11')]

    worker_manager = WorkerManager(128)
    #manager = WorkerManager(128)

    #ff = Fetcher(worker_manager)
    next_link = 'http://www.qiushibaike.com'
    #ff.push(next_link)

    page_num = 0

    while True:
        page_num += 1
        print("Turning to page " + str(page_num) + " " + next_link)
        ans = opener.open(next_link).read()
        next_link = ""
        #req, ans = ff.pop()
        soup = BeautifulSoup(ans.decode('utf-8'), "html.parser")

        # "下一页" is the site's "next page" link text
        for ii in soup.find_all('a', class_="next", text="下一页"):
            next_link = ii["href"]
        if not next_link:
            print("No next-page link found, stopping.")
            break
        next_link = "http://www.qiushibaike.com" + next_link
        #ff.push(next_link)

        for ii in soup.find_all('div', class_="author"):
            print(ii.a["href"].split('/')[2])
            store_this_user(ii.a["href"].split('/')[2], worker_manager)


# ---- scratch/test snippets left over from development, kept commented out ----
# (some of them use the older Fetcher(threads_num) constructor)

'''
file.close()


ff = Fetcher(10)
ff.push('http://www.qiushibaike.com/users/14870461')
req, ans = ff.pop()

print(ans.decode('utf8'))


#os.system("pause")


testgbk = '汉字'
testunit = testgbk.encode('gbk')     # encode to gbk
print(testunit)

testutf8 = testgbk.encode('utf-8')   # encode to utf-8
print(testutf8)

testunit = testutf8.decode('utf-8')  # decode from utf-8
print(testunit)

testgbk = testunit.encode('gbk')     # re-encode to gbk
print(testgbk)
'''


'''
links = ['http://item.jd.com/%d.html' % i for i in range(1746854, 1746860)]
ff = Fetcher(10)
for url in links:
    ff.push(url)

while ff.taskleft():
    (url, content) = ff.pop()
    print(url, len(content))
'''


'''
url = 'http://www.sina.com'
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
values = {'name': 'Michael Foord',
          'location': 'pythontab',
          'language': 'Python'}
headers = {'User-Agent': user_agent}

data = urllib.parse.urlencode(values)
#req = urllib.request.Request(url, data, headers)
req = urllib.request.Request('http://www.baidu.com')
req.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')
'''


'''
##################################
def cbk(a, b, c):
    # callback for urlretrieve:
    # @a: number of blocks downloaded so far
    # @b: size of each block
    # @c: total size of the remote file
    per = 100.0 * a * b / c
    if per > 100:
        per = 100
    num = int(per)

    print('[', end='')
    for i in range(num):
        print('#', end='')
    print('%.2f]' % (per), end='')

url = 'http://www.sina.com.cn'
local = 'e:\sina.html'
urllib.request.urlretrieve(url, local, cbk)

input()
os.system("pause")
##################################
'''

'''
try:
    response = urllib.request.urlopen(req)
    print('ffdfsdfsf')
except urllib.error.URLError as e:
    if hasattr(e, 'reason'):
        print('We failed to reach a server.')
        print('Reason: ', e.reason)
    elif hasattr(e, 'code'):
        print('The server cannot fulfill the request.')
        print('Reason: ', e.code)
else:
    #print(response.info)
    #print(response.getcode())
    response_context = response.read()
    print(response_context.decode("utf8"))
'''


if __name__ == "__main__":
    main()
```
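For context, the WorkerManager/Fetcher pair above can also be driven on its own, the same way store_this_user does. Below is a minimal usage sketch under that assumption; the fetch_some helper and the URL are placeholders for illustration and are not part of the original script.

```python
# Minimal usage sketch of the WorkerManager/Fetcher pair defined above.
# fetch_some and the example URL are hypothetical, for illustration only.
def fetch_some(urls):
    manager = WorkerManager(4)   # small thread pool
    ff = Fetcher(manager)        # registers its fetch and retry loops with the pool
    for url in urls:
        ff.push(url)             # enqueue the requests
    while ff.taskleft():
        url, content = ff.pop()  # blocks until one (url, body) pair is ready
        print(url, len(content))

#fetch_some(['http://www.qiushibaike.com'])
```

Note that Fetcher registers its fetch and retry loops with the pool in its constructor, and those loops give up after one idle second (q_req.get(timeout=1)), so the URLs need to be pushed promptly after the Fetcher is created.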