python
python简单爬虫手机号-凯发ag旗舰厅登录网址下载
需求分析
项目中需要根据手机号的前 7 位判断号码是否合法,并查询号码归属地。原有的数据还是几年前的,已经过时,因此打算用 Python 爬虫重新爬取一份。
单线程版本
# coding:utf-8
import requests
from datetime import datetime
class phoneinfospider:
    """Crawl number-segment information and append it to ``./result.txt``.

    For every prefix in ``phonesections`` a series of phone numbers is
    generated (prefix + 4-digit counter + "0000"), each one is looked up
    through the Taobao ``mobile_tel_segment`` endpoint, and the fields
    extracted from the response are appended to the result file as one
    comma-separated line.
    """

    def __init__(self, phonesections):
        """Remember the number prefixes (strings) that should be crawled."""
        self.phonesections = phonesections

    def phoneinfohandler(self, textdata):
        """Extract the fields of one lookup response and persist them.

        The response text is split into lines; when it has at least nine
        lines, the text between the first pair of single quotes on lines
        1, 2, 3 and 5 (0-based) is taken as number, province, mobile area
        and postcode respectively.

        Args:
            textdata: Raw response body of one lookup.

        Returns:
            The comma-separated line that was printed (and appended to
            ``./result.txt``), or ``None`` when the response has fewer than
            nine lines and is therefore skipped.

        Raises:
            IndexError: If one of the inspected lines holds no quoted value.
        """
        text = textdata.splitlines(True)
        if len(text) < 9:
            return None

        number, province, mobile_area, postcode = (
            text[index].split("'")[1] for index in (1, 2, 3, 5)
        )
        line_text = ",".join((number, province, mobile_area, postcode))
        print(line_text)

        try:
            # The context manager closes the handle after every write; an
            # explicit encoding keeps non-ASCII values writable regardless
            # of the platform's default locale.
            with open('./result.txt', 'a', encoding='utf-8') as f:
                f.write(line_text + '\n')
        except OSError as e:
            print(type(e).__name__, ":", e)
        return line_text

    def requestphoneinfo(self, phonenum):
        """Look up one phone number and pass the response to the handler.

        Any failure (network error, unexpected response layout, ...) is
        printed and swallowed so that one bad number does not abort the
        whole crawl.

        Args:
            phonenum: The full phone number as a string.
        """
        try:
            url = 'https://tcc.taobao.com/cc/json/mobile_tel_segment.htm?tel=' + phonenum
            # A timeout prevents a single stalled connection from blocking
            # the crawl forever.
            response = requests.get(url, timeout=10)
            self.phoneinfohandler(response.text)
        except Exception as e:
            print(type(e).__name__, ":", e)

    def requestallsections(self, last=0, count=10):
        """Crawl every configured prefix.

        Args:
            last: Counter value to resume from for the FIRST prefix only
                (useful to continue after an abnormal exit); later prefixes
                always start at 0.
            count: Exclusive upper bound of the 4-digit counter. The
                default of 10 is a small trial run; pass 10000 to cover a
                whole prefix.
        """
        for head in self.phonesections:
            head_begin = datetime.now()
            print(head + " begin time:" + str(head_begin))
            for i in range(last, count):
                # Build the number: prefix + zero-padded counter + "0000".
                middle = str(i).zfill(4)
                self.requestphoneinfo(head + middle + "0000")
            # Only the first prefix resumes from `last`.
            last = 0
            head_end = datetime.now()
            print(head + " end time:" + str(head_end))
if __name__ == '__main__':
    # Entry point of the single-threaded crawler: build the list of number
    # prefixes, crawl all of them and print start/end timestamps.
    task_begin = datetime.now()
    print("phone check begin time:" + str(task_begin))
    # Prefix lists: China Telecom, China Unicom, China Mobile, virtual operators.
    dx = ['133', '149', '153', '173', '177', '180', '181', '189', '199']
    # '166' used to be listed twice, which crawled that prefix twice and
    # duplicated its rows in the result file.
    lt = ['130', '131', '132', '145', '146', '155', '156', '166', '171', '175', '176', '185', '186']
    yd = ['134', '135', '136', '137', '138', '139', '147', '148', '150', '151', '152', '157', '158', '159', '172',
          '178', '182', '183', '184', '187', '188', '198']
    add = ['170']
    all_num = dx + lt + yd + add
    print(len(all_num))
    # Crawl every prefix in the combined list.
    spider = phoneinfospider(all_num)
    spider.requestallsections()
    task_end = datetime.now()
    print("phone check end time:" + str(task_end))
实际运行发现:爬取一个号段共需 10000 次查询,单线程版本大概要一个半小时,太慢了。
多线程版本
# coding:utf-8
import requests
from datetime import datetime
import queue
import threading
# Number of worker threads started for each number prefix.
threadnum = 32
class mythread(threading.Thread):
    """Thread that simply runs the zero-argument callable it was given."""

    def __init__(self, func):
        """Create the thread.

        Args:
            func: Callable invoked without arguments when the thread runs.
        """
        super().__init__()
        self.func = func

    def run(self):
        """Invoke the stored callable in the new thread."""
        self.func()
def requestphoneinfo():
    """Worker loop: take tasks from the shared queue until it is empty.

    Relies on the module-level globals ``q`` (task queue), ``lock`` (guards
    the emptiness check plus the dequeue) and ``phone_head`` (prefix that is
    currently being crawled), all of which are set up by the main script.
    Each task is looked up via the Taobao endpoint and the response is handed
    to ``phoneinfohandler``. Failures are printed and skipped so that one bad
    lookup does not kill the worker.
    """
    while True:
        # Check-and-take must be atomic; otherwise another worker could drain
        # the queue between the size check and get(), which would block forever.
        with lock:
            if q.qsize() == 0:
                break
            print("queue size:" + str(q.qsize()))
            p = q.get()  # fetch the next task

        # Derive the 4-digit counter from the task itself. Tasks are enqueued
        # as 1..10000, so p - 1 yields 0000..9999. Computing it from
        # q.qsize() after releasing the lock was racy: concurrent workers
        # could produce the same counter twice and skip others.
        middle = str(p - 1).zfill(4)
        phonenum = phone_head + middle + "0000"
        print("phonenum:" + phonenum)
        try:
            url = 'https://tcc.taobao.com/cc/json/mobile_tel_segment.htm?tel=' + phonenum
            # A timeout keeps a stalled connection from blocking this worker forever.
            response = requests.get(url, timeout=10)
            phoneinfohandler(response.text)
        except Exception as e:
            print(type(e).__name__, ":", e)
def phoneinfohandler(textdata):
    """Extract the fields of one lookup response and append them to ./result.txt.

    The response is split into lines. If it has at least nine lines, the
    value between the first pair of single quotes on lines 1, 2, 3 and 5
    (0-based) is used as number, province, mobile area and postcode.

    Args:
        textdata: Raw response body of one lookup.

    Returns:
        The comma-separated line that was printed and appended to the result
        file, or ``None`` if the response is shorter than nine lines.

    Raises:
        IndexError: If one of the inspected lines contains no quoted value.
    """
    rows = textdata.splitlines(True)
    if len(rows) < 9:
        return None

    fields = [rows[index].split("'")[1] for index in (1, 2, 3, 5)]
    line_text = ",".join(fields)
    print(line_text)

    try:
        # Close the handle after each write (several threads append to this
        # file) and pin the encoding so non-ASCII values can always be written.
        with open('./result.txt', 'a', encoding='utf-8') as f:
            f.write(line_text + '\n')
    except OSError as e:
        print(type(e).__name__, ":", e)
    return line_text
if __name__ == '__main__':
    # Entry point of the multi-threaded crawler. For every prefix a fresh
    # task queue is filled and `threadnum` workers drain it in parallel.
    task_begin = datetime.now()
    print("phone check begin time:" + str(task_begin))
    dx = ['133', '149', '153', '173', '177', '180', '181', '189', '199']
    # '166' used to be listed twice, which crawled that prefix twice.
    lt = ['130', '131', '132', '145', '155', '156', '166', '171', '175', '176', '185', '186']
    yd = ['134', '135', '136', '137', '138', '139', '147', '150', '151', '152', '157', '158', '159', '172', '178',
          '182', '183', '184', '187', '188', '198']
    all_num = dx + lt + yd
    print(len(all_num))
    for head in all_num:
        head_begin = datetime.now()
        print(head + " begin time:" + str(head_begin))

        # Module-level state shared with the worker function.
        q = queue.Queue()
        lock = threading.Lock()
        phone_head = head

        # One task per 4-digit counter value; tasks are numbered 1..10000.
        for p in range(10000):
            q.put(p + 1)
        print(q.qsize())

        threads = []
        for _ in range(threadnum):
            thread = mythread(requestphoneinfo)
            thread.start()
            threads.append(thread)
        # Wait for all workers before switching to the next prefix, because
        # `phone_head` is shared by every worker.
        for thread in threads:
            thread.join()

        head_end = datetime.now()
        print(head + " end time:" + str(head_end))
    task_end = datetime.now()
    print("phone check end time:" + str(task_end))
多线程版本爬取 1 个号段(10000 次查询)大概只需 2~3 分钟,期间 CPU 使用率飙升,大致维持在 70% 左右。
总共 40 多个号段,全部爬完大概需要 1~2 个小时,总数据量在 41 万条左右。
以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持我们。
本文标题: python手机号前7位归属地爬虫代码实例
本文地址: http://www.cppcns.com/jiaoben/python/304740.html
总结
以上是凯发ag旗舰厅登录网址下载为你收集整理的python简单爬虫手机号_python手机号前7位归属地爬虫代码实例的全部内容,希望文章能够帮你解决所遇到的问题。
如果觉得凯发ag旗舰厅登录网址下载网站内容还不错,欢迎将凯发ag旗舰厅登录网址下载推荐给好友。
- 上一篇: python field readonl
- 下一篇: python函数的返回值是返回引用吗_p