import math
import random
import re
import sys
import threading
from time import ctime, sleep
from lxml import etree
import pprint
import requests
from selenium import webdriver
f = open('spider_北上广深_district.txt', 'r', encoding='utf-8')
f.closed
POOL_URL_DISTRICT_LIST = []
for i in f:
d = i.replace('\n', '').replace(' ', '').split('"')
for ii in d:
if ii.find('http') > -1:
POOL_URL_DISTRICT_LIST.append(ii)
POOL_URL_DISTRICT_MAXPAGE_NUM_DIC = {}
res_dic = {}
POOL_URL_DISTRICT_LIST_B = []
MAX_PAGE_NUM = 100
def gen_url(num=MAX_PAGE_NUM):
for url in POOL_URL_DISTRICT_LIST:
l = url.split('//')[1].split('lianjia')
[city, district] = l
city = city[:-1]
district = district.split('.com/')[1]
if city != 'sh':
url_ = '%s%s/%s' % ('https://m.lianjia.com/', city, district)
else:
url_ = '%s%s/' % ('http://m.sh.lianjia.com/', district)
POOL\_URL\_DISTRICT\_MAXPAGE\_NUM\_DIC\[url\_\] = num
POOL\_URL\_DISTRICT\_LIST\_B.append(url\_)
gen_url()
exception_url_list = []
URL_NUM_EACH_THREAD = 100 * 0.6 * 4
res_dic = {}
todo_url_list = []
for url_ in POOL_URL_DISTRICT_LIST_B:
if url_.find('sh.') > -1:
page_addition = 'd'
else:
page_addition = 'pg'
for page_num in range(1, MAX_PAGE_NUM, 1):
url = '%s%s%s/' % (url_, page_addition, page_num)
todo_url_list.append(url)
LEN = len(todo_url_list)
browser = webdriver.Firefox()
def grab_todo_url_list(browser):
global res_dic, todo_url_list
d = random.randint(1, 2)
if d % 2 == 0:
todo\_url\_list.reverse()
my\_control = len(todo\_url\_list)
my\_control\_start = random.randint(0, my\_control)
for i in range(my\_control\_start, my\_control, 1):
if len(todo\_url\_list) > i:
url = todo\_url\_list\[i\]
if url not in todo\_url\_list:
continue
sleep(1)
browser.get(url)
html = browser.page\_source
web\_site = ''
url\_pass\_flag = 0
if html.find('price\_total') > -1:
selector = etree.HTML(html)
url\_l = selector.xpath('//a\[@class="a\_mask"\]/@href')
des\_l = selector.xpath('//div\[@class="item\_other text\_cut"\]/text()')
price\_total\_l = selector.xpath('//span\[@class="price\_total"\]/em/text()')
unit\_price\_l = selector.xpath('//span\[@class="unit\_price"\]/text()')
url\_pass\_flag = 1
elif html.find('xiaoquname') > -1:
web\_site = 'sh'
selector = etree.HTML(html)
url\_l = selector.xpath('//ul\[@class="fang-list"\]/li/a/@href')
xiaoquname\_l = selector.xpath('//span\[@class="xiaoquname"\]/text()')
area\_l = selector.xpath('//p\[@class="f-area"\]/text()')
price\_total\_l = selector.xpath('//span\[@class="f-price"\]/text()')
url\_pass\_flag = 1
# https://m.lianjia.com/bj/ershoufang/yizhuangkaifaqu/pg87
# http://m.sh.lianjia.com/ershoufang/jinshan/d78
elif html.find('搜索条件') > -1 or url.find('/lf/') > -1:
print(111, url)
url\_pass\_flag = 2
if url\_pass\_flag == 1:
res\_dic\[url\] = {}
len\_l = len(url\_l)
res\_dic\[url\]\['items\_list'\] = \[\]
len\_l\_ = len\_l - 1
for i in range(0, len\_l\_, 1):
d = {}
d\['spider\_url'\] = url
d\['item\_url'\] = url\_l\[i\]
d\['des'\] = des\_l\[i\] if web\_site == '' else '%s||%s' % (
area\_l\[i\].replace('\\n', '').replace(',', '').replace(' ', ''),
xiaoquname\_l\[i\].replace(',', '').replace(' ', ''))
d\['price\_total'\] = price\_total\_l\[i\]
d\['unit\_price'\] = unit\_price\_l\[i\] if web\_site == '' else 'sh'
res\_dic\[url\]\['items\_list'\].append(d)
if url\_pass\_flag != 0:
if url in todo\_url\_list:
l\_index = todo\_url\_list.index(url)
del todo\_url\_list\[l\_index\]
else:
if url not in todo\_url\_list:
todo\_url\_list.append(url)
browser.close()
class MyThread(threading.Thread):
def __init__(self, func, args, name=''):
threading.Thread.__init__(self)
self.name = name
self.func = func
self.args = args
def run(self):
self.func(self.args)
MAX_EXCEPTION_URL_NUM = 0
def deal_exception_url_list():
global todo_url_list
browser = webdriver.Firefox()
if len(todo_url_list) > MAX_EXCEPTION_URL_NUM:
grab_todo_url_list(browser)
else:
return
deal_exception_url_list()
POOL_URL_LEN_B = len(POOL_URL_DISTRICT_LIST_B)
def main():
print('starting at:', ctime())
threads_list = []
thread_sum = math.ceil(LEN / URL_NUM_EACH_THREAD)
for nloop in range(0, thread_sum, 1):
browser = webdriver.Firefox()
thread_instance = MyThread(grab_todo_url_list, (browser), grab_todo_url_list.__name__)
threads_list.append(thread_instance)
# 主进程将在所有非守护进程退出后,退出
for t in threads_list:
t.setDaemon = False
t.start()
# wait for all thrades to finish
for t in threads_list:
t.join()
# pprint.pprint(res_dic)
deal_exception_url_list()
print('end_r:', ctime())
f_name = 'mobile_lianjia_ershoufang_BSGS.csv'
f = open(f_name, 'w', encoding='utf-8-sig')
str = 'spider_url,item_url,des,price_total,unit_price\n'
f.write(str)
f.closed
f = open(f_name, 'a', encoding='utf-8-sig')
for url in res\_dic:
try:
for d in res\_dic\[url\]\['items\_list'\]:
str = '%s,%s,%s,%s,%s\\n' % (d\['spider\_url'\], d\['item\_url'\], d\['des'\], d\['price\_total'\], d\['unit\_price'\])
f.write(str)
except Exception:
print(Exception)
f.closed
print('end\_w:', ctime())
if __name__ == '__main__':
main()
# -*- coding: UTF-8 -*-
import math
import random
import sys
import threading
from time import ctime, sleep
import requests
MAX_PAGINATION = 100
pagination = MAX_PAGINATION
QPS = 50
QPS_TIME_UNIT = 1
INFOCODE_OK = ''
file_name_key_pool = 'key_pool.pool'
KEY_POOL_LIST = []
touse_key = ''
f = open(file_name_key_pool, 'r', encoding='utf-8')
for i in f:
try:
list_ = i.split('\t')
key = i.split('\t')[1].split()
KEY_POOL_LIST.append(key[0])
except Exception:
print(Exception)
KEY_POOL_NUM_INDICATOR = len(KEY_POOL_LIST)
URL_TYPE = 'http://restapi.amap.com/v3/place/text'
touse_key = ''
keywords = '&keywords='
OFFSET = '&offset=2'
CITYLIMIT = '&citylimit=false'
POI_TYPES = '&types=060100|060101|060102|060400'
URL_FOR_CHANGE_KEY = 'http://restapi.amap.com/v3/place/text?key=%s&types=060100&city=010&OFFSET=1'
change_key_qps = 0
def change_key():
global touse_key, change_key_qps
# 高德没有遵守自己的QPS/日限策略;所不能通过其返回码,来控制key的使用;
mean_use_key = random.randint(0, KEY_POOL_NUM_INDICATOR)
for i in range(mean_use_key, KEY_POOL_NUM_INDICATOR, 1):
key = KEY_POOL_LIST[i]
if key == touse_key:
if i == KEY_POOL_NUM_INDICATOR:
change_key()
return
else:
continue
touse_key = key
url = URL_FOR_CHANGE_KEY % (touse_key)
try:
change_key_qps += 1
if change_key_qps % QPS == 0:
sleep(QPS_TIME_UNIT)
r = requests.get(url)
json_ = r.json()
except Exception:
print('requests.get(url)', Exception)
change_key()
return
infocode = json_['infocode']
if not infocode == INFOCODE_OK:
if i == KEY_POOL_NUM_INDICATOR:
sys.exit('NOInvalidKEY')
change_key()
return
return
FNAME = '【商场任务】28个城市_任务列表_20170727 - 副本.csv'
tosupply_dic = {}
todo_list = []
fo = open(FNAME, 'r', encoding='gbk')
file_line_num = 0
for i in fo:
file_line_num += 1
if file_line_num == 1:
continue
todo_list.append(file_line_num)
tosupply_dic[file_line_num] = {}
l = i.replace('\n', '').replace(',,,,,,,', '').split(',')
dic_ = {}
dic_['sequence_number'] = l[0]
dic_['type'] = l[1]
dic_['city'] = l[2]
dic_['district'] = l[3]
dic_['address'] = l[4]
dic_['name'] = l[5]
dic_['gd_type_1'], dic_['gd_type_2'], dic_['gd_type_3'], dic_['gd_name'], dic_['gd_province'], dic_['gd_city'], \
dic_['gd_district'], dic_['gd_address'] = ['', '', '', '', '', '', '', '']
tosupply_dic[file_line_num] = dic_
LEN = len(todo_list)
EACH_THREAD_REQUEST_NUM = 30
requests_counter = 0
tosupply_dic_len = len(tosupply_dic)
tosupply_dic_len_ = tosupply_dic_len - 1
def supply_dic(nloop):
global tosupply_dic, requests_counter, todo_list
print(len(todo_list))
d = random.randint(1, 2)
if d % 2 == 0:
todo_list.reverse()
for file\_line\_num in todo\_list:
if file\_line\_num not in todo\_list:
continue
t = threading.current\_thread()
print('nloop=',nloop)
print(' t.\_ident=',t.\_ident)
dic\_ = tosupply\_dic\[file\_line\_num\]
city = dic\_\['district'\]
name = dic\_\['name'\]
url = '%s?key=%s&keywords=%s&city=%s%s%s%s' % (URL\_TYPE, touse\_key, name, city, POI\_TYPES, OFFSET, CITYLIMIT)
if requests\_counter % QPS == 0:
sleep(QPS\_TIME\_UNIT)
try:
r = requests.get(url)
r\_json = r.json()
except Exception:
if file\_line\_num not in todo\_list:
todo\_list.append(file\_line\_num)
continue
infocode = r\_json\['infocode'\]
if infocode == '':
count = r\_json\['count'\]
if int(count) > 0:
pois\_list = r\_json\['pois'\]
pos\_dic = pois\_list\[0\]
tosupply\_dic\[file\_line\_num\]\['gd\_type\_one'\] = pos\_dic\['type'\]
tosupply\_dic\[file\_line\_num\]\['gd\_type\_1'\], tosupply\_dic\[file\_line\_num\]\['gd\_type\_2'\], \\
tosupply\_dic\[file\_line\_num\]\['gd\_type\_3'\] = pos\_dic\['type'\].split('|')\[0\].split(';')
tosupply\_dic\[file\_line\_num\]\['gd\_province'\] = pos\_dic\['pname'\]
tosupply\_dic\[file\_line\_num\]\['gd\_city'\] = pos\_dic\['cityname'\]
tosupply\_dic\[file\_line\_num\]\['gd\_district'\] = pos\_dic\['adname'\]
tosupply\_dic\[file\_line\_num\]\['gd\_address'\] = pos\_dic\['address'\]
elif int(count) == 0:
tosupply\_dic\[file\_line\_num\]\['gd\_name'\] = 'GD-NO-DATA'
if file\_line\_num in todo\_list:
list\_index = todo\_list.index(file\_line\_num)
del todo\_list\[list\_index\]
print(file\_line\_num)
else:
if file\_line\_num not in todo\_list:
todo\_list.append(file\_line\_num)
change\_key()
MAX_EXCEPTION_URL_NUM = 0
def deal_exception_list():
global todo_list
print(todo_list)
if len(todo_list) > MAX_EXCEPTION_URL_NUM:
supply_dic()
else:
return
deal_exception_list()
class MyThread(threading.Thread):
def __init__(self, func, args, name=''):
threading.Thread.__init__(self)
self.name = name
self.func = func
self.args = args
def run(self):
self.func(self.args)
def main():
print('starting at:', ctime())
threads_list = []
thread_sum = math.ceil(tosupply_dic_len / EACH_THREAD_REQUEST_NUM)
print(185, thread_sum)
for nloop in range(0, thread_sum, 1):
thread_instance = MyThread(supply_dic,(nloop),supply_dic.__name__)
threads_list.append(thread_instance)
# 主进程将在所有非守护进程退出后,退出
for t in threads_list:
t.setDaemon = False
t.start()
# wait for all thrades to finish
for t in threads_list:
t.join()
deal_exception_list()
FGEN = 'GEN\_28.csv'
fo = open(FGEN, 'w', encoding='utf-8-sig')
fo.write(
'序号,类别编号,城市名称,区域 地址,商圈名,gd\_type\_one,gd\_type\_1,gd\_type\_2,gd\_type\_3,gd\_name,gd\_province,gd\_city,gd\_district,gd\_address\\n')
fo.closed
fo = open(FGEN, 'a', encoding='utf-8-sig')
for file\_line\_num in tosupply\_dic:
if file\_line\_num == 1:
continue
dic\_ = tosupply\_dic\[file\_line\_num\]
str = '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\\n' % (
dic\_\['sequence\_number'\], dic\_\['type'\], dic\_\['city'\], dic\_\['district'\], dic\_\['address'\],
dic\_\['name'\], dic\_\['gd\_type\_one'\], dic\_\['gd\_type\_1'\], dic\_\['gd\_type\_2'\], dic\_\['gd\_type\_3'\], dic\_\['gd\_name'\],
dic\_\['gd\_province'\],
dic\_\['gd\_city'\],
dic\_\['gd\_district'\],
dic\_\['gd\_address'\])
fo.write(str)
fo.closed
if __name__ == '__main__':
main()
16G内存OK 4G内存 --- no
# -*- coding: UTF-8 -*-
import math
import random
import sys
import threading
from time import ctime, sleep
import requests
MAX_PAGINATION = 100
pagination = MAX_PAGINATION
QPS = 50
QPS_TIME_UNIT = 1
INFOCODE_OK = ''
file_name_key_pool = 'key_pool.pool'
KEY_POOL_LIST = []
touse_key = ''
f = open(file_name_key_pool, 'r', encoding='utf-8')
for i in f:
try:
list_ = i.split('\t')
key = i.split('\t')[1].split()
KEY_POOL_LIST.append(key[0])
except Exception:
print(Exception)
KEY_POOL_NUM_INDICATOR = len(KEY_POOL_LIST)
URL_TYPE = 'http://restapi.amap.com/v3/place/text'
touse_key = ''
keywords = '&keywords='
OFFSET = '&offset=2'
CITYLIMIT = '&citylimit=false'
POI_TYPES = '&types=080601'
URL_FOR_CHANGE_KEY = 'http://restapi.amap.com/v3/place/text?key=%s&types=060100&city=010&OFFSET=1'
change_key_qps = 0
def change_key():
global touse_key, change_key_qps
# 高德没有遵守自己的QPS/日限策略;所不能通过其返回码,来控制key的使用;
mean_use_key = random.randint(0, KEY_POOL_NUM_INDICATOR)
for i in range(mean_use_key, KEY_POOL_NUM_INDICATOR, 1):
key = KEY_POOL_LIST[i]
if key == touse_key:
if i == KEY_POOL_NUM_INDICATOR:
change_key()
return
else:
continue
touse_key = key
url = URL_FOR_CHANGE_KEY % (touse_key)
try:
change_key_qps += 1
if change_key_qps % QPS == 0:
sleep(QPS_TIME_UNIT)
r = requests.get(url)
json_ = r.json()
except Exception:
print('requests.get(url)', Exception)
change_key()
return
infocode = json_['infocode']
if not infocode == INFOCODE_OK:
if i == KEY_POOL_NUM_INDICATOR:
sys.exit('NOInvalidKEY')
change_key()
return
return
FNAME = '电影院任务列表_20170724.csv'
tosupply_dic = {}
fo = open(FNAME, 'r', encoding='gbk')
file_line_num = 0
for i in fo:
file_line_num += 1
if file_line_num == 1:
continue
tosupply_dic[file_line_num] = {}
is_from_past_line = 1 if len(i.split(',,,,,,,,')) > 1 else 0
l = i.replace('\n', '').replace(',,,,,,,', '').split(',')
dic_ = {}
dic_['sequence_number'] = l[0]
dic_['area'] = l[1] if is_from_past_line == 0 else tosupply_dic[file_line_num - 1]['area']
dic_['province'] = l[2] if is_from_past_line == 0 else tosupply_dic[file_line_num - 1][
'province']
dic_['city'] = l[3] if is_from_past_line == 0 else tosupply_dic[file_line_num - 1]['city']
dic_['district'] = ''
dic_['address'] = ''
dic_['buliding'] = ''
dic_['longitude_latitude'] = ''
dic_['busniess_type'] = l[8] if is_from_past_line == 0 else l[1]
dic_['name'] = l[9] if is_from_past_line == 0 else l[2]
dic_['parent_company'] = l[10] if is_from_past_line == 0 else tosupply_dic[file_line_num - 1]['parent_company']
dic_['is_from_past_line'] = is_from_past_line
dic_['gd_name'] = ''
dic_['gd_city'] = ''
tosupply_dic[file_line_num] = dic_
EACH_THREAD_REQUEST_NUM = 30
exception_line_num_list = []
requests_counter = 0
tosupply_dic_len = len(tosupply_dic)
tosupply_dic_len_ = tosupply_dic_len - 1
def supply_dic(thread_strat_file_line_num):
global tosupply_dic, requests_counter, tosupply_dic_len_, exception_line_num_list
for loop in range(0, EACH_THREAD_REQUEST_NUM, 1):
file_line_num = thread_strat_file_line_num + loop
if file_line_num - 2 > tosupply_dic_len_:
return
if file_line_num < 2 or len(tosupply_dic[file_line_num]['district']) > 0:
if file_line_num in exception_line_num_list:
list_index = exception_line_num_list.index(file_line_num)
del exception_line_num_list[list_index]
continue
dic_ = tosupply_dic[file_line_num]
city = dic_['city']
name = dic_['name']
url = '%s?key=%s&keywords=%s&city=%s%s%s%s' % (URL_TYPE, touse_key, name, city, POI_TYPES, OFFSET, CITYLIMIT)
if requests_counter % QPS == 0:
sleep(QPS_TIME_UNIT)
try:
r = requests.get(url)
r_json = r.json()
except Exception:
if file_line_num not in exception_line_num_list:
exception_line_num_list.append(file_line_num)
continue
infocode = r_json['infocode']
if infocode == '':
count = r_json['count']
if int(count) > 0:
pois_list = r_json['pois']
pos_dic = pois_list[0]
tosupply_dic[file_line_num]['district'] = pos_dic['adname']
if len(pos\_dic\['address'\]) <= 2:
print(pos\_dic)
print(pos\_dic\['address'\])
tosupply\_dic\[file\_line\_num\]\['address'\] = pos\_dic\['address'\] if len(pos\_dic\['address'\]) > 2 else '高德缺地址'
if len(pos\_dic\['address'\]) <= 2:
print(tosupply\_dic\[file\_line\_num\]\['address'\])
tosupply\_dic\[file\_line\_num\]\['longitude\_latitude'\] = pos\_dic\['location'\]
tosupply\_dic\[file\_line\_num\]\['gd\_name'\] = pos\_dic\['name'\]
tosupply\_dic\[file\_line\_num\]\['gd\_city'\] = pos\_dic\['cityname'\]
elif int(count) == 0:
tosupply\_dic\[file\_line\_num\]\['gd\_name'\] = 'GD-NO-DATA'
if file\_line\_num in exception\_line\_num\_list:
list\_index = exception\_line\_num\_list.index(file\_line\_num)
del exception\_line\_num\_list\[list\_index\]
else:
if file\_line\_num not in exception\_line\_num\_list:
exception\_line\_num\_list.append(file\_line\_num)
change\_key()
MAX_EXCEPTION_URL_NUM = 0
def deal_exception_list():
global exception_line_num_list
print(exception_line_num_list)
if len(exception_line_num_list) > MAX_EXCEPTION_URL_NUM:
for thread_strat_file_line_num in exception_line_num_list:
supply_dic(thread_strat_file_line_num)
else:
return
deal_exception_list()
class MyThread(threading.Thread):
def __init__(self, func, args, name=''):
threading.Thread.__init__(self)
self.name = name
self.func = func
self.args = args
def run(self):
self.func(self.args)
def gen_building(str):
start_ = 0
end_ = len(str)
res = ''
start_flag = 0
end_flag = 0
if str.find('号') > -1:
start_ = str.find('号') + 1
start_flag = 1
elif str.find('交汇处') > -1:
start_ = str.find('交汇处') + 1 + 2
start_flag = 1
elif str.find('交叉口') > -1:
start_ = str.find('交叉口') + 1 + 2
start_flag = 1
elif str.find('路') > -1:
start_ = str.find('路') + 1
start_flag = 1
elif str.find('道') > -1:
start_ = str.find('道') + 1
start_flag = 1
if str.find('层') > -1:
end\_ = str.find('层') + 1
end\_flag = 1
elif str.find('楼') > -1:
end\_ = str.find('楼') + 1
end\_flag = 1
if start\_flag == 1 or end\_flag == 1:
res = ''.join((list(str)\[start\_:end\_\]))
if res.find('(') > -1 or res.find('(') > -1:
# new rule
res = res.replace('(', '').replace(')', '').replace('(', '').replace(')', '')
return res
def main():
print('starting at:', ctime())
threads_list = []
thread_sum = math.ceil(tosupply_dic_len / EACH_THREAD_REQUEST_NUM)
print(185, thread_sum)
for nloop in range(0, thread_sum, 1):
thread_strat_file_line_num = nloop * EACH_THREAD_REQUEST_NUM
print(thread_strat_file_line_num)
thread_instance = MyThread(supply_dic, (thread_strat_file_line_num), supply_dic.__name__)
threads_list.append(thread_instance)
# 主进程将在所有非守护进程退出后,退出
for t in threads_list:
t.setDaemon = False
t.start()
# wait for all thrades to finish
for t in threads_list:
t.join()
# pprint.pprint(res_dic)
deal\_exception\_list()
for i in exception\_line\_num\_list:
print('EXCEPTION', i)
FGEN = '电影院任务列表\_20170724\_新增列\_已计算楼宇.csv'
fo = open(FGEN, 'w', encoding='utf-8-sig')
fo.write('序号,地理区域,省份,城市,区域,地址,所属楼宇,经纬度,商圈属性,店铺名,所属院线,is\_from\_past\_line,gd\_name,gd\_city\\n')
fo.closed
fo = open(FGEN, 'a', encoding='utf-8-sig')
for file\_line\_num in tosupply\_dic:
if file\_line\_num == 1:
continue
dic\_ = tosupply\_dic\[file\_line\_num\]
str = '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\\n' % (
dic\_\['sequence\_number'\], dic\_\['area'\], dic\_\['province'\], dic\_\['city'\], dic\_\['district'\],
dic\_\['address'\].replace(',', ' '),
gen\_building(dic\_\['address'\]), dic\_\['longitude\_latitude'\].replace(',', ' '), dic\_\['busniess\_type'\],
dic\_\['name'\],
dic\_\['parent\_company'\],
dic\_\['is\_from\_past\_line'\], dic\_\['gd\_name'\], dic\_\['gd\_city'\])
fo.write(str)
fo.closed
if __name__ == '__main__':
main()
# -*- coding: UTF-8 -*-
import math
import random
import sys
import threading
from time import ctime, sleep
import requests
MAX_PAGINATION = 100
pagination = MAX_PAGINATION
QPS = 50
QPS_TIME_UNIT = 1
INFOCODE_OK = '10000'
file_name_key_pool = 'key_pool.pool'
KEY_POOL_LIST = []
touse_key = ''
f = open(file_name_key_pool, 'r', encoding='utf-8')
for i in f:
try:
list_ = i.split('\t')
key = i.split('\t')[1].split()
KEY_POOL_LIST.append(key[0])
except Exception:
print(Exception)
KEY_POOL_NUM_INDICATOR = len(KEY_POOL_LIST)
URL_TYPE = 'http://restapi.amap.com/v3/place/text'
touse_key = ''
keywords = '&keywords='
OFFSET = '&offset=2'
CITYLIMIT = '&citylimit=false'
POI_TYPES = '&types=080601'
URL_FOR_CHANGE_KEY = 'http://restapi.amap.com/v3/place/text?key=%s&types=060100&city=010&OFFSET=1'
change_key_qps = 0
def change_key():
global touse_key, change_key_qps
# 高德没有遵守自己的QPS/日限策略;所不能通过其返回码,来控制key的使用;
mean_use_key = random.randint(0, KEY_POOL_NUM_INDICATOR)
for i in range(mean_use_key, KEY_POOL_NUM_INDICATOR, 1):
key = KEY_POOL_LIST[i]
if key == touse_key:
if i == KEY_POOL_NUM_INDICATOR:
change_key()
return
else:
continue
touse_key = key
url = URL_FOR_CHANGE_KEY % (touse_key)
try:
change_key_qps += 1
if change_key_qps % QPS == 0:
sleep(QPS_TIME_UNIT)
r = requests.get(url)
json_ = r.json()
except Exception:
print('requests.get(url)', Exception)
change_key()
return
infocode = json_['infocode']
if not infocode == INFOCODE_OK:
if i == KEY_POOL_NUM_INDICATOR:
sys.exit('NOInvalidKEY')
change_key()
return
return
FNAME = '电影院任务列表_20170724.csv'
tosupply_dic = {}
fo = open(FNAME, 'r', encoding='gbk')
file_line_num = 0
for i in fo:
file_line_num += 1
if file_line_num == 1:
continue
tosupply_dic[file_line_num] = {}
is_from_past_line = 1 if len(i.split(',,,,,,,,')) > 1 else 0
l = i.replace('\n', '').replace(',,,,,,,', '').split(',')
dic_ = {}
dic_['sequence_number'] = l[0]
dic_['area'] = l[1] if is_from_past_line == 0 else tosupply_dic[file_line_num - 1]['area']
dic_['province'] = l[2] if is_from_past_line == 0 else tosupply_dic[file_line_num - 1][
'province']
dic_['city'] = l[3] if is_from_past_line == 0 else tosupply_dic[file_line_num - 1]['city']
dic_['district'] = ''
dic_['address'] = ''
dic_['buliding'] = ''
dic_['longitude_latitude'] = ''
dic_['busniess_type'] = l[8] if is_from_past_line == 0 else l[1]
dic_['name'] = l[9] if is_from_past_line == 0 else l[2]
dic_['parent_company'] = l[10] if is_from_past_line == 0 else tosupply_dic[file_line_num - 1]['parent_company']
dic_['is_from_past_line'] = is_from_past_line
dic_['gd_name'] = ''
dic_['gd_city'] = ''
tosupply_dic[file_line_num] = dic_
EACH_THREAD_REQUEST_NUM = 30
exception_line_num_list = []
requests_counter = 0
tosupply_dic_len = len(tosupply_dic)
tosupply_dic_len_ = tosupply_dic_len - 1
def supply_dic(thread_strat_file_line_num):
global tosupply_dic, requests_counter, tosupply_dic_len_, exception_line_num_list
for loop in range(0, EACH_THREAD_REQUEST_NUM, 1):
file_line_num = thread_strat_file_line_num + loop
if file_line_num - 2 > tosupply_dic_len_:
return
if file_line_num < 2 or len(tosupply_dic[file_line_num]['district']) > 0:
if file_line_num in exception_line_num_list:
list_index = exception_line_num_list.index(file_line_num)
del exception_line_num_list[list_index]
continue
dic_ = tosupply_dic[file_line_num]
city = dic_['city']
name = dic_['name']
url = '%s?key=%s&keywords=%s&city=%s%s%s%s' % (URL_TYPE, touse_key, name, city, POI_TYPES, OFFSET, CITYLIMIT)
if requests_counter % QPS == 0:
sleep(QPS_TIME_UNIT)
try:
r = requests.get(url)
r_json = r.json()
except Exception:
if file_line_num not in exception_line_num_list:
exception_line_num_list.append(file_line_num)
continue
infocode = r_json['infocode']
if infocode == '10000':
count = r_json['count']
if int(count) > 0:
pois_list = r_json['pois']
pos_dic = pois_list[0]
tosupply_dic[file_line_num]['district'] = pos_dic['adname']
if len(pos\_dic\['address'\]) <= 2:
print(pos\_dic)
print(pos\_dic\['address'\])
tosupply\_dic\[file\_line\_num\]\['address'\] = pos\_dic\['address'\] if len(pos\_dic\['address'\]) > 2 else '高德缺地址'
if len(pos\_dic\['address'\]) <= 2:
print(tosupply\_dic\[file\_line\_num\]\['address'\])
tosupply\_dic\[file\_line\_num\]\['longitude\_latitude'\] = pos\_dic\['location'\]
tosupply\_dic\[file\_line\_num\]\['gd\_name'\] = pos\_dic\['name'\]
tosupply\_dic\[file\_line\_num\]\['gd\_city'\] = pos\_dic\['cityname'\]
elif int(count) == 0:
tosupply\_dic\[file\_line\_num\]\['gd\_name'\] = 'GD-NO-DATA'
if file\_line\_num in exception\_line\_num\_list:
list\_index = exception\_line\_num\_list.index(file\_line\_num)
del exception\_line\_num\_list\[list\_index\]
else:
if file\_line\_num not in exception\_line\_num\_list:
exception\_line\_num\_list.append(file\_line\_num)
change\_key()
MAX_EXCEPTION_URL_NUM = 0
def deal_exception_list():
global exception_line_num_list
print(exception_line_num_list)
if len(exception_line_num_list) > MAX_EXCEPTION_URL_NUM:
for thread_strat_file_line_num in exception_line_num_list:
supply_dic(thread_strat_file_line_num)
else:
return
deal_exception_list()
class MyThread(threading.Thread):
def __init__(self, func, args, name=''):
threading.Thread.__init__(self)
self.name = name
self.func = func
self.args = args
def run(self):
self.func(self.args)
def gen_building(str):
start_ = 0
end_ = len(str)
res = ''
start_flag = 0
end_flag = 0
if str.find('号') > -1:
start_ = str.find('号') + 1
start_flag = 1
elif str.find('交汇处') > -1:
start_ = str.find('交汇处') + 1 + 2
start_flag = 1
elif str.find('交叉口') > -1:
start_ = str.find('交叉口') + 1 + 2
start_flag = 1
elif str.find('路') > -1:
start_ = str.find('路') + 1
start_flag = 1
elif str.find('道') > -1:
start_ = str.find('道') + 1
start_flag = 1
if str.find('层') > -1:
end\_ = str.find('层') + 1
end\_flag = 1
elif str.find('楼') > -1:
end\_ = str.find('楼') + 1
end\_flag = 1
if start\_flag == 1 or end\_flag == 1:
res = ''.join((list(str)\[start\_:end\_\]))
if res.find('('):
# new rule
res = res.replace('(', '').replace(')', '')
return res
def main():
print('starting at:', ctime())
threads_list = []
thread_sum = math.ceil(tosupply_dic_len / EACH_THREAD_REQUEST_NUM)
print(185, thread_sum)
for nloop in range(0, thread_sum, 1):
thread_strat_file_line_num = nloop * EACH_THREAD_REQUEST_NUM
print(thread_strat_file_line_num)
thread_instance = MyThread(supply_dic, (thread_strat_file_line_num), supply_dic.__name__)
threads_list.append(thread_instance)
# 主进程将在所有非守护进程退出后,退出
for t in threads_list:
t.setDaemon = False
t.start()
# wait for all thrades to finish
for t in threads_list:
t.join()
# pprint.pprint(res_dic)
deal\_exception\_list()
for i in exception\_line\_num\_list:
print('EXCEPTION', i)
FGEN = '电影院任务列表\_20170724\_新增列\_已计算楼宇.csv'
fo = open(FGEN, 'w', encoding='utf-8-sig')
fo.write('序号,地理区域,省份,城市,区域,地址,所属楼宇,经纬度,商圈属性,店铺名,所属院线,is\_from\_past\_line,gd\_name,gd\_city\\n')
fo.closed
fo = open(FGEN, 'a', encoding='utf-8-sig')
for file\_line\_num in tosupply\_dic:
if file\_line\_num == 1:
continue
dic\_ = tosupply\_dic\[file\_line\_num\]
str = '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\\n' % (
dic\_\['sequence\_number'\], dic\_\['area'\], dic\_\['province'\], dic\_\['city'\], dic\_\['district'\],
dic\_\['address'\].replace(',', ' '),
gen\_building(dic\_\['address'\]), dic\_\['longitude\_latitude'\].replace(',', ' '), dic\_\['busniess\_type'\],
dic\_\['name'\],
dic\_\['parent\_company'\],
dic\_\['is\_from\_past\_line'\], dic\_\['gd\_name'\], dic\_\['gd\_city'\])
fo.write(str)
fo.closed
if __name__ == '__main__':
main()
高德缺地址
{'id': 'B0FFGJYCPN', 'name': '横店电影城(原阳店)', 'type': '体育休闲服务;影剧院;电影院', 'typecode': '', 'biz_type': 'cinema', 'address': [], 'location': '113.960307,35.065736', 'tel': '0373-5911199', 'distance': [], 'biz_ext': [], 'pname': '河南省', 'cityname': '新乡市', 'adname': '原阳县', 'importance': [], 'shopid': [], 'shopinfo': '', 'poiweight': []}
[]
高德缺地址
{'id': 'B0FFH0368R', 'name': '横店电影城', 'type': '体育休闲服务;影剧院;电影院', 'typecode': '', 'biz_type': 'cinema', 'address': [], 'location': '118.430053,29.862110', 'tel': [], 'distance': [], 'biz_ext': [], 'pname': '安徽省', 'cityname': '黄山市', 'adname': '歙县', 'importance': [], 'shopid': [], 'shopinfo': '', 'poiweight': []}
[]
高德缺地址
[300, 30, 2, 390, 330, 480, 570, 900, 840, 450, 1140, 630, 990, 180, 1050, 90, 240, 360, 720, 750, 690, 1170, 60, 1230, 960, 210, 1200, 930, 510, 150, 600, 870, 1080, 810, 660, 540, 270, 420, 1110, 120, 780, 1020, 1113, 813, 544, 69, 94, 119]
[390, 330, 570, 840, 1140, 990, 1050, 240, 720, 690, 60, 960, 1200, 510, 600, 1080, 660, 270, 1110, 780, 1113]
[330, 840, 990, 240, 690, 960, 510, 1080, 270, 780]
[840, 240, 960, 1080, 780]
[240, 1080]
[1080]
[]
# -*- coding: UTF-8 -*-
import re
import pprint
import json
import time
import math
import sys
import requests
import threading
from time import ctime, sleep
import random
MAX_PAGINATION = 100
pagination = MAX_PAGINATION
QPS = 50
QPS_TIME_UNIT = 1
INFOCODE_OK = ''
file_name_key_pool = 'key_pool.pool'
KEY_POOL_LIST = []
touse_key = ''
f = open(file_name_key_pool, 'r', encoding='utf-8')
for i in f:
try:
list_ = i.split('\t')
key = i.split('\t')[1].split()
KEY_POOL_LIST.append(key[0])
except Exception:
print(Exception)
KEY_POOL_NUM_INDICATOR = len(KEY_POOL_LIST)
URL_TYPE = 'http://restapi.amap.com/v3/place/text'
touse_key = ''
keywords = '&keywords='
OFFSET = '&offset=2'
CITYLIMIT = '&citylimit=false'
POI_TYPES = '&types=080601'
URL_FOR_CHANGE_KEY = 'http://restapi.amap.com/v3/place/text?key=%s&types=060100&city=010&OFFSET=1'
change_key_qps = 0
def change_key():
global touse_key, change_key_qps
# 高德没有遵守自己的QPS/日限策略;所不能通过其返回码,来控制key的使用;
mean_use_key = random.randint(0, KEY_POOL_NUM_INDICATOR)
for i in range(mean_use_key, KEY_POOL_NUM_INDICATOR, 1):
key = KEY_POOL_LIST[i]
if key == touse_key:
if i == KEY_POOL_NUM_INDICATOR:
change_key()
return
else:
continue
touse_key = key
url = URL_FOR_CHANGE_KEY % (touse_key)
try:
change_key_qps += 1
if change_key_qps % QPS == 0:
sleep(QPS_TIME_UNIT)
r = requests.get(url)
json_ = r.json()
except Exception:
print('requests.get(url)', Exception)
change_key()
return
infocode = json_['infocode']
if not infocode == INFOCODE_OK:
if i == KEY_POOL_NUM_INDICATOR:
sys.exit('NOInvalidKEY')
change_key()
return
return
FNAME = '电影院任务列表_20170724.csv'
tosupply_dic = {}
fo = open(FNAME, 'r', encoding='gbk')
file_line_num = 0
for i in fo:
file_line_num += 1
if file_line_num == 1:
continue
tosupply_dic[file_line_num] = {}
is_from_past_line = 1 if len(i.split(',,,,,,,,')) > 1 else 0
l = i.replace('\n', '').replace(',,,,,,,', '').split(',')
dic_ = {}
dic_['sequence_number'] = l[0]
dic_['area'] = l[1] if is_from_past_line == 0 else tosupply_dic[file_line_num - 1]['area']
dic_['province'] = l[2] if is_from_past_line == 0 else tosupply_dic[file_line_num - 1][
'province']
dic_['city'] = l[3] if is_from_past_line == 0 else tosupply_dic[file_line_num - 1]['city']
dic_['district'] = ''
dic_['address'] = ''
dic_['buliding'] = ''
dic_['longitude_latitude'] = ''
dic_['busniess_type'] = l[8] if is_from_past_line == 0 else l[1]
dic_['name'] = l[9] if is_from_past_line == 0 else l[2]
dic_['parent_company'] = l[10] if is_from_past_line == 0 else tosupply_dic[file_line_num - 1]['parent_company']
dic_['is_from_past_line'] = is_from_past_line
dic_['gd_name'] = ''
dic_['gd_city'] = ''
tosupply_dic[file_line_num] = dic_
EACH_THREAD_REQUEST_NUM = 30
exception_line_num_list = []
requests_counter = 0
tosupply_dic_len = len(tosupply_dic)
tosupply_dic_len_ = tosupply_dic_len - 1
def supply_dic(thread_strat_file_line_num):
global tosupply_dic, requests_counter, tosupply_dic_len_, exception_line_num_list
for loop in range(0, EACH_THREAD_REQUEST_NUM, 1):
file_line_num = thread_strat_file_line_num + loop
if file_line_num - 2 > tosupply_dic_len_:
return
if file_line_num < 2 or len(tosupply_dic[file_line_num]['district']) > 0:
if file_line_num in exception_line_num_list:
list_index = exception_line_num_list.index(file_line_num)
del exception_line_num_list[list_index]
continue
dic_ = tosupply_dic[file_line_num]
city = dic_['city']
name = dic_['name']
url = '%s?key=%s&keywords=%s&city=%s%s%s%s' % (URL_TYPE, touse_key, name, city, POI_TYPES, OFFSET, CITYLIMIT)
if requests_counter % QPS == 0:
sleep(QPS_TIME_UNIT)
try:
r = requests.get(url)
r_json = r.json()
except Exception:
if file_line_num not in exception_line_num_list:
exception_line_num_list.append(file_line_num)
continue
infocode = r_json['infocode']
if infocode == '':
count = r_json['count']
if int(count) > 0:
pois_list = r_json['pois']
pos_dic = pois_list[0]
tosupply_dic[file_line_num]['district'] = pos_dic['adname']
if len(pos\_dic\['address'\]) <= 2:
print(pos\_dic)
print(pos\_dic\['address'\])
tosupply\_dic\[file\_line\_num\]\['address'\] = pos\_dic\['address'\] if len(pos\_dic\['address'\]) > 2 else '高德缺地址'
if len(pos\_dic\['address'\]) <= 2:
print(tosupply\_dic\[file\_line\_num\]\['address'\])
tosupply\_dic\[file\_line\_num\]\['longitude\_latitude'\] = pos\_dic\['location'\]
tosupply\_dic\[file\_line\_num\]\['gd\_name'\] = pos\_dic\['name'\]
tosupply\_dic\[file\_line\_num\]\['gd\_city'\] = pos\_dic\['cityname'\]
elif int(count) == 0:
tosupply\_dic\[file\_line\_num\]\['gd\_name'\] = 'GD-NO-DATA'
if file\_line\_num in exception\_line\_num\_list:
list\_index = exception\_line\_num\_list.index(file\_line\_num)
del exception\_line\_num\_list\[list\_index\]
else:
if file\_line\_num not in exception\_line\_num\_list:
exception\_line\_num\_list.append(file\_line\_num)
change\_key()
MAX_EXCEPTION_URL_NUM = 0
def deal_exception_list():
global exception_line_num_list
print(exception_line_num_list)
if len(exception_line_num_list) > MAX_EXCEPTION_URL_NUM:
for thread_strat_file_line_num in exception_line_num_list:
supply_dic(thread_strat_file_line_num)
else:
return
deal_exception_list()
class MyThread(threading.Thread):
def __init__(self, func, args, name=''):
threading.Thread.__init__(self)
self.name = name
self.func = func
self.args = args
def run(self):
self.func(self.args)
def gen_building(str):
start_ = 0
end_ = 0
if str.find('号') > -1:
start_ = str.find('号') + 1
elif str.find('路') > -1:
start_ = str.find('路') + 1
elif str.find('道') > -1:
start_ = str.find('道') + 1
if str.find('层') > -1:
end\_ = str.find('层') + 1
elif str.find('楼') > -1:
end\_ = str.find('楼') + 1
if end\_ - start\_ > 3:
return ''.join((list(str)\[start\_:end\_\]))
else:
return ''
def main():
print('starting at:', ctime())
threads_list = []
thread_sum = math.ceil(tosupply_dic_len / EACH_THREAD_REQUEST_NUM)
print(185, thread_sum)
for nloop in range(0, thread_sum, 1):
thread_strat_file_line_num = nloop * EACH_THREAD_REQUEST_NUM
print(thread_strat_file_line_num)
thread_instance = MyThread(supply_dic, (thread_strat_file_line_num), supply_dic.__name__)
threads_list.append(thread_instance)
# 主进程将在所有非守护进程退出后,退出
for t in threads_list:
t.setDaemon = False
t.start()
# wait for all thrades to finish
for t in threads_list:
t.join()
# pprint.pprint(res_dic)
deal\_exception\_list()
for i in exception\_line\_num\_list:
print('EXCEPTION', i)
FGEN = '电影院任务列表\_20170724\_新增列\_已计算楼宇.csv'
fo = open(FGEN, 'w', encoding='utf-8-sig')
fo.write('序号,地理区域,省份,城市,区域,地址,所属楼宇,经纬度,商圈属性,店铺名,所属院线,is\_from\_past\_line,gd\_name,gd\_city\\n')
fo.closed
fo = open(FGEN, 'a', encoding='utf-8-sig')
for file\_line\_num in tosupply\_dic:
if file\_line\_num == 1:
continue
dic\_ = tosupply\_dic\[file\_line\_num\]
str = '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\\n' % (
dic\_\['sequence\_number'\], dic\_\['area'\], dic\_\['province'\], dic\_\['city'\], dic\_\['district'\],
dic\_\['address'\].replace(',', ' '),
gen\_building(dic\_\['address'\]), dic\_\['longitude\_latitude'\].replace(',', ' '), dic\_\['busniess\_type'\],
dic\_\['name'\],
dic\_\['parent\_company'\],
dic\_\['is\_from\_past\_line'\], dic\_\['gd\_name'\], dic\_\['gd\_city'\])
fo.write(str)
fo.closed
if __name__ == '__main__':
main()
{'id': 'B02DD0R6M6', 'name': '横店电影城(大汉店)', 'type': '体育休闲服务;影剧院;电影院', 'typecode': '', 'biz_type': 'cinema', 'address': [], 'location': '113.149267,27.838133', 'tel': '0731-22915555', 'distance': [], 'biz_ext': [], 'pname': '湖南省', 'cityname': '株洲市', 'adname': '芦淞区', 'importance': [], 'shopid': [], 'shopinfo': '', 'poiweight': []}
[]
高德缺地址
{'id': 'B0FFGJYCPN', 'name': '横店电影城(原阳店)', 'type': '体育休闲服务;影剧院;电影院', 'typecode': '', 'biz_type': 'cinema', 'address': [], 'location': '113.960307,35.065736', 'tel': '0373-5911199', 'distance': [], 'biz_ext': [], 'pname': '河南省', 'cityname': '新乡市', 'adname': '原阳县', 'importance': [], 'shopid': [], 'shopinfo': '', 'poiweight': []}
[]
高德缺地址
{'id': 'B0FFH0368R', 'name': '横店电影城', 'type': '体育休闲服务;影剧院;电影院', 'typecode': '', 'biz_type': 'cinema', 'address': [], 'location': '118.430053,29.862110', 'tel': [], 'distance': [], 'biz_ext': [], 'pname': '安徽省', 'cityname': '黄山市', 'adname': '歙县', 'importance': [], 'shopid': [], 'shopinfo': '', 'poiweight': []}
[]
高德缺地址
[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ]
[, , , , , , , , , , , , , , , , , , , , ]
if url in exception\_url\_list:
l\_index = exception\_url\_list.index(url)
print(139, 'del')
del exception\_url\_list\[l\_index\]
class MyThread(threading.Thread):
def __init__(self, func, args, name=''):
threading.Thread.__init__(self)
self.name = name
self.func = func
self.args = args
def run(self):
self.func(self.args)
MAX_EXCEPTION_URL_NUM = 60
def deal_exception_url_list():
global exception_url_list
if len(exception_url_list) > MAX_EXCEPTION_URL_NUM:
for url in exception_url_list:
grab_one_url(url)
else:
return
deal_exception_url_list()
# -*- coding: UTF-8 -*-
import math
import random
import re
import sys
import threading
from time import ctime, sleep
from lxml import etree
import pprint
import requests
BASE_URL = 'https://www.dianping.com/'
url_district_list = []
GLUE = 'search'
f = open('spider_深圳_district_bussi-nav_url_list - 副本.txt', 'r', encoding='utf-8')
f.closed
SCALA = 'SCALA'
for i in f:
URL_POOL = i.split(SCALA)
break
URL_POOL_LEN = len(URL_POOL)
URL_NUM_EACH_THREAD = 1
res_dic = {}
MAX_PAGE_NUM = 50
QPS = 30
request_counter = 0
QPS_TIME_UNIT = 1
exception_url_list = []
def grab_one_url(url):
global res_dic, request_counter, exception_url_list
if request_counter % QPS == 0:
print(36, 'sleep', request_counter)
sleep(QPS_TIME_UNIT)
request\_counter += 1
try:
r = requests.get(url)
except Exception:
if url not in exception\_url\_list:
exception\_url\_list.append(url)
print(exception\_url\_list)
return
html = r.text
selector = etree.HTML(html)
page\_title = selector.xpath('//title/text()')\[0\]
try:
page\_Keywords = selector.xpath('//meta\[@name="Keywords"\]')\[0\].attrib\['content'\].replace(',', '、')
except Exception:
if url not in exception\_url\_list:
exception\_url\_list.append(url)
print(exception\_url\_list)
return
data\_ga\_index\_1 = selector.xpath('.//a\[@data-ga-index="1"\]/span/text()')\[0\]
data\_ga\_index\_2 = selector.xpath('.//a\[@data-ga-index="2"\]/span/text()')\[0\]
data\_ga\_index\_3 = selector.xpath('.//a\[@data-ga-index="3"\]/span/text()')\[0\]
res\_dic\[url\] = {}
res\_dic\[url\]\['page\_title'\] = page\_title
res\_dic\[url\]\['page\_Keywords'\] = page\_Keywords
res\_dic\[url\]\['data\_ga\_index\_1'\] = data\_ga\_index\_1
res\_dic\[url\]\['data\_ga\_index\_2'\] = data\_ga\_index\_2
res\_dic\[url\]\['data\_ga\_index\_3'\] = data\_ga\_index\_3
name\_l = selector.xpath('.//li\[@class=""\]//h4/text()')
mean\_price\_l = selector.xpath('.//li\[@class=""\]//a\[@class="mean-price"\]/b/text()')
flavour\_l = selector.xpath('.//li\[@class=""\]//div\[@class="tag-addr"\]/a\[1\]/span/text()')
position\_l = selector.xpath('.//li\[@class=""\]//div\[@class="tag-addr"\]/a\[2\]/span/text()')
address\_l = selector.xpath('.//li\[@class=""\]//div\[@class="tag-addr"\]/span/text()')
len\_l = len(name\_l)
res\_dic\[url\]\['items\_num'\] = len\_l
res\_dic\[url\]\['items\_list'\] = \[\]
len\_l\_ = len\_l - 1
for i in range(0, len\_l\_, 1):
d = {}
d\['name'\] = name\_l\[i\].replace(',', '、')
d\['mean\_price'\] = mean\_price\_l\[i\] if len(mean\_price\_l) - 1 >= i else ''
d\['flavour'\] = flavour\_l\[i\]
d\['position'\] = position\_l\[i\]
# https://www.dianping.com/search/category/7/10/r12335p1
d\['address'\] = address\_l\[i\].replace(',', '、')
res\_dic\[url\]\['items\_list'\].append(d)
if url in exception\_url\_list:
l\_index = exception\_url\_list.index(url)
print(139, 'del')
del exception\_url\_list\[l\_index\]
class MyThread(threading.Thread):
def __init__(self, func, args, name=''):
threading.Thread.__init__(self)
self.name = name
self.func = func
self.args = args
def run(self):
self.func(self.args)
MAX_EXCEPTION_URL_NUM = 60
def deal_exception_url_list():
global exception_url_list
if len(exception_url_list) > MAX_EXCEPTION_URL_NUM:
for url in exception_url_list:
grab_one_url(url)
else:
return
deal_exception_url_list()
def main():
print('starting at:', ctime())
threads_list = []
thread_sum = math.ceil(URL_POOL_LEN / URL_NUM_EACH_THREAD)
for nloop in range(0, URL_POOL_LEN, 1):
for nnloop in range(1, MAX_PAGE_NUM, 1):
url = URL_POOL[nloop]
url = '%s%s%sp%s' % (BASE_URL, GLUE, url, nnloop)
print(62, url)
thread_instance = MyThread(grab_one_url, (url), grab_one_url.__name__)
threads_list.append(thread_instance)
# 主进程将在所有非守护进程退出后,退出
for t in threads_list:
print(70, t)
t.setDaemon = False
t.start()
# wait for all thrades to finish
for t in threads_list:
t.join()
# pprint.pprint(res_dic)
deal\_exception\_url\_list()
f\_name = 'dzdp\_基于区-大商圈的餐馆列表-深圳.csv'
f = open(f\_name, 'w', encoding='utf-8-sig')
f.write('')
f.closed
f = open(f\_name, 'a', encoding='utf-8-sig')
str = 'name,mean\_price, flavour, position,address,url,page\_title, page\_Keywords, data\_ga\_index\_1, data\_ga\_index\_2, data\_ga\_index\_3,\\n'
f.write(str)
for url in res\_dic:
page\_title = res\_dic\[url\]\['page\_title'\]
page\_Keywords = res\_dic\[url\]\['page\_Keywords'\]
data\_ga\_index\_1 = res\_dic\[url\]\['data\_ga\_index\_1'\]
data\_ga\_index\_2 = res\_dic\[url\]\['data\_ga\_index\_2'\]
data\_ga\_index\_3 = res\_dic\[url\]\['data\_ga\_index\_3'\]
for d in res\_dic\[url\]\['items\_list'\]:
name = d\['name'\]
mean\_price = d\['mean\_price'\]
flavour = d\['flavour'\]
position = d\['position'\]
address = d\['address'\]
str = '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\\n' % (
name, mean\_price, flavour, position, address, url, page\_title, page\_Keywords, data\_ga\_index\_1,
data\_ga\_index\_2, data\_ga\_index\_3)
f.write(str)
f.closed
f\_name = 'dzdp\_基于区-大商圈的餐馆列表-深圳\_EXCEPTION\_URL.csv'
f = open(f\_name, 'w', encoding='utf-8-sig')
f.write('')
f.closed
f = open(f\_name, 'a', encoding='utf-8-sig')
for url in exception\_url\_list:
f.write(url + '\\n')
f.closed
if __name__ == '__main__':
main()
#
82000 277
186000 345
42000 251
186000 346
186000 347
42000 252
82000 278
42000 253
42000 254
40000 346
40000 347
42000 255
40000 348
42000 256
40000 349
82000 279
40000 350
sleep
72000 279
12000 350
sleep
72000 280
72000 281
72000 282
96000 274
72000 283
96000 275
186000 348
72000 284
186000 349
106000 275
132000 328
166000 298
188000 372
60000 336
60000 337
60000 338
60000 339
60000 340
82000 280
42000 257
82000 281
82000 282
60000 341
186000 350
sleep
96000 276
72000 285
72000 286
40000 351
72000 287
96000 277
96000 278
72000 288
72000 289
96000 279
72000 290
96000 280
72000 291
96000 281
72000 292
2000 371
96000 282
102000 255
# -*- coding: UTF-8 -*-
import re
import pprint
import json
import time
import math
import sys
import requests
import threading
from time import ctime, sleep
import random
ZHITONGZI_CITY_DIC = {}
f = open('直筒子市_东莞中山.txt', 'r', encoding='utf-8')
ZHITONGZI_CITY_DIC['东莞市'] = []
ZHITONGZI_CITY_DIC['中山市'] = []
c = 0
for i in f:
ii = i.split(';')
for iii in ii:
iv = iii.split('、')
if len(iv) > 2:
c += 1
for v in iv:
if v.find('(') > -1:
v_ = v.split('(')[1]
elif v.find(')') > -1:
v_ = v.split(')')[0]
else:
v_ = v
if c == 1 or c == 2:
ZHITONGZI_CITY_DIC['东莞市'].append(v_)
elif c == 3 or c == 4:
ZHITONGZI_CITY_DIC['中山市'].append(v_)
f.closed
def chk_is_coffee(str):
l_ = ['coffee', 'coffe', 'cafe', 'café', 'starbucks', 'caffé']
# 上岛花式铁板烧 日本菜
# 泛太平洋大酒店面馆 其他
l_b = ['咖啡', '星巴克']
# 星巴克
for i in l_:
if str.upper().find(i.upper()) != -1:
return True
for i in l_b:
if str.find(i) != -1:
return True
return False
def chk_kfc_mdl(str):
if str.find(u"麦当劳") != -1:
return 1
elif str.find(u"肯德基") != -1 or str.upper().find(u"KFC") != -1:
return 0
else:
return 2
def get_name(str):
if str.find("麦当劳") != -1:
return '麦当劳'
elif str.find("肯德基") != -1 or str.upper().find(u"KFC") != -1:
return '肯德基'
else:
# str = '狗不理包子(前门店)'
# str = '(清真)三羊水饺(新民路店)'
# | 添椒 | 潮涮三国IP火锅
if str.find('(') == -1 and str.find('(') == -1:
return str
res = str.strip(' ').split('(')[0].strip(' ')
if len(res) == 0:
try:
res = str.split(')')[1].split('(')[0]
except Exception:
print(Exception)
# 一锅两头牛(烟青路店)
res_b = res
try:
res_b = res.split('(')[0]
except Exception:
print(Exception)
return res\_b
def chk_city_district(str):
city_district = str.replace(' ', '')
if re.match(r".*[a-zA-Z0-9]", city_district) is not None:
return False
elif str.find("[") != -1 or str.find("(") != -1 or str.find(")") != -1:
return False
else:
return city_district
def chk_catering_kind(str):
catering_kind = str.replace(' ', '')
if re.match(r".*[0-9]", catering_kind) is not None:
return False
else:
return catering_kind
def chk_list_thickness(list_):
if len(list_) == 0:
return False
res_list = []
for i in list_:
i_b = i.replace(' ', '')
if i.replace(' ', '') == '':
return False
else:
res_list.append(i_b)
return res_list
business_area_tag_list = ['大厦', '大楼', '大厦', '百货', '购物中心', '商业中心', 'MALL', '广场', '商场', '单元', '栋', '座', '楼', '层', '底商']
def chk_in_business_area(str):
global business_area_tag_list
for i in business_area_tag_list:
if str.find(i) > -1:
return 1
return 0
MAX_PAGINATION = 100
pagination = MAX_PAGINATION
QPS = 50
TIME_UNIT = 1
INFOCODE_OK = ''
file_name_key_pool = 'key_pool.pool'
KEY_POOL_LIST = []
touse_key = ''
f = open(file_name_key_pool, 'r', encoding='utf-8')
for i in f:
try:
list_ = i.split('\t')
key = i.split('\t')[1].split()
KEY_POOL_LIST.append(key[0])
except Exception:
print(Exception)
KEY_POOL_NUM_INDICATOR = len(KEY_POOL_LIST) - 1
URL_TYPE = 'http://restapi.amap.com/v3/place/text'
touse_key = ''
RADIUS = '&radius=20'
keywords = '&keywords='
OFFSET = '&offset=10'
CITYLIMIT = '&citylimit=true'
URL_FOR_CHANGE_KEY = 'http://restapi.amap.com/v3/place/text?key=%s&types=060100&city=010&OFFSET=1'
change_key_qps = 0
def change_key():
global touse_key, change_key_qps
# 高德没有遵守自己的QPS/日限策略;所不能通过其返回码,来控制key的使用;
mean\_use\_key = random.randint(0, KEY\_POOL\_NUM\_INDICATOR)
for i in range(mean\_use\_key, KEY\_POOL\_NUM\_INDICATOR, 1):
key = KEY\_POOL\_LIST\[i\]
if key == touse\_key:
if i == KEY\_POOL\_NUM\_INDICATOR:
change\_key()
return
else:
continue
touse\_key = key
url = URL\_FOR\_CHANGE\_KEY % (touse\_key)
try:
change\_key\_qps += 1
if change\_key\_qps % QPS == 0:
sleep(TIME\_UNIT)
r = requests.get(url)
json\_ = r.json()
except Exception:
print('requests.get(url)', Exception)
change\_key()
return
infocode = json\_\['infocode'\]
if not infocode == INFOCODE\_OK:
if i == KEY\_POOL\_NUM\_INDICATOR:
sys.exit('NOInvalidKEY')
change\_key()
return
return
# 060101 购物服务 商场 购物中心
FILTER_GD_BUSINESS_AREA_TYPE_LIST = ['购物中心']
def fliter_gd_business_area_type(url):
global FILTER_GD_BUSINESS_AREA_TYPE_LIST
# {"suggestion":{"keywords":{},"cities":{}},"info":"OK","status":"1","count":"1","pois":[{"typecode":"050118","adname":"西城区","biz_type":"diner","id":"B0FFG8RYC7","pname":"北京市","importance":{},"location":"116.393715,39.957242","distance":{},"tel":"18600185618","biz_ext":{},"shopid":{},"address":"德胜街道六铺炕北小街8-1号","poiweight":{},"cityname":"北京市","type":"餐饮服务;中餐厅;特色\/地方风味餐厅","shopinfo":"0","name":"炙热青春"}],"infocode":"10000"
try:
r = requests.get(url)
r_json = r.json()
except Exception:
print(203, Exception)
# 返回数据解析json异常
return 3
infocode = r_json['infocode']
if infocode == '':
count = r_json['count']
if int(count) > 0:
pois_list = r_json['pois']
for l in pois_list:
type = l['type']
for chk_type in FILTER_GD_BUSINESS_AREA_TYPE_LIST:
if type.find(chk_type) > -1:
return 1
else:
change_key()
return 0
cater_dic = {}
cater_exception_list = []
count_catering = 0
count_catering_exception = 0
coffee_list = []
count_coffee = 0
fo = open('MEITUAN_DAZHONG_20170705 - 副本.csv', 'r')
file_line_list = []
for i in fo:
file_line_list.append(i)
fo.closed
file_line_list_len = len(file_line_list)
file_jump_step_num = 2000
count_catering_exception = 0
count_coffee = 0
count_catering = 0
def get_exception_logic_split_loop(nloop):
global touse_key, cater_dic, file_line_list, file_line_list_len, file_jump_step_num, count_catering_exception, count_coffee, count_catering
start_line = nloop * file_jump_step_num
if start_line >= file_line_list_len:
print('last-line')
return
else:
start_line_count = 0
end_line = start_line + file_jump_step_num
if end_line >= file_line_list_len:
end_line = file_line_list_len - 1
for i in range(start_line, end_line, 1):
l_ = file_line_list[i].replace('\n', '').split(',')
city = l_[0]
district = l_[1]
address = l_[2]
name = l_[3]
average_price = l_[4]
catering_kind = l_[5]
data_from = 'mtdz_5'
# 数据准备层
# 数据运算层
# 该层处理从目标文件取出的字段列表
focus_list = [city, district, address, name, catering_kind, average_price, data_from]
dic_exception = {}
dic_exception['data_from'] = data_from
dic_exception['city'] = city
dic_exception['district'] = district
dic_exception['name'] = name
dic_exception['address'] = address
dic_exception['catering_kind'] = catering_kind
dic_exception['average_price'] = average_price
if not chk\_list\_thickness(focus\_list) or not chk\_city\_district(city) or not chk\_city\_district(
district) or not chk\_catering\_kind(catering\_kind):
count\_catering\_exception += 1
cater\_exception\_list.append(dic\_exception)
else:
name = get\_name(name)
m = chk\_is\_coffee(name)
# if m:
# print(list\_)
if not m:
m = chk\_is\_coffee(catering\_kind)
if m:
count\_coffee += 1
coffee\_list.append(dic\_exception)
if not m:
dic\_details = {}
dic\_details\['data\_from'\] = data\_from
dic\_details\['catering\_kind'\] = catering\_kind
dic\_details\['average\_price'\] = average\_price
if\_in\_business\_area = chk\_in\_business\_area(address)
if\_in\_business\_area\_criterion = 'str\_match'
if if\_in\_business\_area == 0:
city\_r = '&city=' + district
keywords = '&keywords=' + address + '|' + name
start\_line\_count += 1
print( start\_line, start\_line\_count)
if start\_line\_count % QPS == 0:
print('sleep')
sleep(1)
url = URL\_TYPE + '?' + 'key=' + touse\_key + RADIUS + keywords + city\_r + CITYLIMIT
if\_in\_business\_area = fliter\_gd\_business\_area\_type(url)
if\_in\_business\_area\_criterion = 'str\_match+request\_api'
dic\_details\['if\_in\_business\_area\_criterion'\] = if\_in\_business\_area\_criterion
dic\_details\['if\_in\_business\_area'\] = if\_in\_business\_area
if city not in cater\_dic:
cater\_dic\[city\] = {}
if district not in cater\_dic\[city\]:
cater\_dic\[city\]\[district\] = {}
if name not in cater\_dic\[city\]\[district\]:
cater\_dic\[city\]\[district\]\[name\] = {}
if address not in cater\_dic\[city\]\[district\]\[name\]:
cater\_dic\[city\]\[district\]\[name\]\[address\] = {}
cater\_dic\[city\]\[district\]\[name\]\[address\] = dic\_details
count\_catering += 1
class MyThread(threading.Thread):
def __init__(self, func, args, name=''):
threading.Thread.__init__(self)
self.name = name
self.func = func
self.args = args
def run(self):
self.func(self.args)
def main():
print('starting at:', ctime())
threads_list = []
thread_sum = math.ceil(file_line_list_len / file_jump_step_num)
print(thread_sum)
for nloop in range(1, thread_sum, 1):
print(nloop)
thread_instance = MyThread(get_exception_logic_split_loop, (nloop), get_exception_logic_split_loop.__name__)
threads\_list.append(thread\_instance)
# 主进程将在所有非守护进程退出后,退出
for t in threads\_list:
print(t)
t.setDaemon = False
t.start()
# wait for all thrades to finish
for t in threads\_list:
t.join()
f\_name = 'ALL.csv'
f = open(f\_name, 'w', encoding='utf-8-sig')
f.write('')
f.closed
f = open(f\_name, 'a', encoding='utf-8-sig')
str = 'city, district, name, address, if\_in\_business\_area, if\_in\_business\_area\_criterion,catering\_kind, average\_price, data\_from\\n'
f.write(str)
## city,district,address,name,catering\_kind,average\_price,data\_from
count\_write\_rows = 0
for i in cater\_dic:
city = i
if city == '城市':
continue
for ii in cater\_dic\[i\]:
district = ii
for iii in cater\_dic\[i\]\[ii\]:
name = iii
for iv in cater\_dic\[i\]\[ii\]\[iii\]:
address = iv
catering\_kind = cater\_dic\[i\]\[ii\]\[iii\]\[iv\]\['catering\_kind'\]
average\_price = cater\_dic\[i\]\[ii\]\[iii\]\[iv\]\['average\_price'\]
if\_in\_business\_area = cater\_dic\[i\]\[ii\]\[iii\]\[iv\]\['if\_in\_business\_area'\]
if\_in\_business\_area\_criterion = cater\_dic\[i\]\[ii\]\[iii\]\[iv\]\['if\_in\_business\_area\_criterion'\]
data\_from = cater\_dic\[i\]\[ii\]\[iii\]\[iv\]\['data\_from'\]
str = '%s,%s,%s,%s,%s,%s,%s,%s,%s\\n' % (
city, district, name, address, if\_in\_business\_area, if\_in\_business\_area\_criterion,
catering\_kind, average\_price, data\_from)
f.write(str)
count\_write\_rows += 1
f.closed
print(count\_write\_rows)
if __name__ == '__main__':
main()
# -*- coding: UTF-8 -*-
import re
import pprint
import json
import time
import math
import sys
import requests
import threading
from time import ctime, sleep
import random
ZHITONGZI_CITY_DIC = {}
f = open('直筒子市_东莞中山.txt', 'r', encoding='utf-8')
ZHITONGZI_CITY_DIC['东莞市'] = []
ZHITONGZI_CITY_DIC['中山市'] = []
c = 0
for i in f:
ii = i.split(';')
for iii in ii:
iv = iii.split('、')
if len(iv) > 2:
c += 1
for v in iv:
if v.find('(') > -1:
v_ = v.split('(')[1]
elif v.find(')') > -1:
v_ = v.split(')')[0]
else:
v_ = v
if c == 1 or c == 2:
ZHITONGZI_CITY_DIC['东莞市'].append(v_)
elif c == 3 or c == 4:
ZHITONGZI_CITY_DIC['中山市'].append(v_)
f.closed
def chk_is_coffee(str):
l_ = ['coffee', 'coffe', 'cafe', 'café', 'starbucks', 'caffé']
# 上岛花式铁板烧 日本菜
# 泛太平洋大酒店面馆 其他
l_b = ['咖啡', '星巴克']
# 星巴克
for i in l_:
if str.upper().find(i.upper()) != -1:
return True
for i in l_b:
if str.find(i) != -1:
return True
return False
def chk_kfc_mdl(str):
if str.find(u"麦当劳") != -1:
return 1
elif str.find(u"肯德基") != -1 or str.upper().find(u"KFC") != -1:
return 0
else:
return 2
def get_name(str):
if str.find("麦当劳") != -1:
return '麦当劳'
elif str.find("肯德基") != -1 or str.upper().find(u"KFC") != -1:
return '肯德基'
else:
# str = '狗不理包子(前门店)'
# str = '(清真)三羊水饺(新民路店)'
# | 添椒 | 潮涮三国IP火锅
if str.find('(') == -1 and str.find('(') == -1:
return str
res = str.strip(' ').split('(')[0].strip(' ')
if len(res) == 0:
try:
res = str.split(')')[1].split('(')[0]
except Exception:
print(Exception)
# 一锅两头牛(烟青路店)
res_b = res
try:
res_b = res.split('(')[0]
except Exception:
print(Exception)
return res\_b
def chk_city_district(str):
city_district = str.replace(' ', '')
if re.match(r".*[a-zA-Z0-9]", city_district) is not None:
return False
elif str.find("[") != -1 or str.find("(") != -1 or str.find(")") != -1:
return False
else:
return city_district
def chk_catering_kind(str):
catering_kind = str.replace(' ', '')
if re.match(r".*[0-9]", catering_kind) is not None:
return False
else:
return catering_kind
def chk_list_thickness(list_):
if len(list_) == 0:
return False
res_list = []
for i in list_:
i_b = i.replace(' ', '')
if i.replace(' ', '') == '':
return False
else:
res_list.append(i_b)
return res_list
business_area_tag_list = ['大厦', '大楼', '大厦', '百货', '购物中心', '商业中心', 'MALL', '广场', '商场', '单元', '栋', '座', '楼', '层', '底商']
def chk_in_business_area(str):
global business_area_tag_list
for i in business_area_tag_list:
if str.find(i) > -1:
return 1
return 0
MAX_PAGINATION = 100
pagination = MAX_PAGINATION
QPS = 50
TIME_UNIT = 1
INFOCODE_OK = ''
file_name_key_pool = 'key_pool.pool'
KEY_POOL_LIST = []
touse_key = ''
f = open(file_name_key_pool, 'r', encoding='utf-8')
for i in f:
try:
list_ = i.split('\t')
key = i.split('\t')[1].split()
KEY_POOL_LIST.append(key[0])
except Exception:
print(Exception)
KEY_POOL_NUM_INDICATOR = len(KEY_POOL_LIST) - 1
URL_TYPE = 'http://restapi.amap.com/v3/place/text'
touse_key = ''
RADIUS = '&radius=20'
keywords = '&keywords='
OFFSET = '&offset=10'
CITYLIMIT = '&citylimit=true'
URL_FOR_CHANGE_KEY = 'http://restapi.amap.com/v3/place/text?key=%s&types=060100&city=010&OFFSET=1'
change_key_qps = 0
def change_key():
global touse_key, change_key_qps
# 高德没有遵守自己的QPS/日限策略;所不能通过其返回码,来控制key的使用;
mean\_use\_key = random.randint(0, KEY\_POOL\_NUM\_INDICATOR)
for i in range(mean\_use\_key, KEY\_POOL\_NUM\_INDICATOR, 1):
key = KEY\_POOL\_LIST\[i\]
if key == touse\_key:
if i == KEY\_POOL\_NUM\_INDICATOR:
change\_key()
return
else:
continue
touse\_key = key
url = URL\_FOR\_CHANGE\_KEY % (touse\_key)
try:
change\_key\_qps += 1
if change\_key\_qps % QPS == 0:
sleep(TIME\_UNIT)
r = requests.get(url)
json\_ = r.json()
except Exception:
print('requests.get(url)', Exception)
change\_key()
return
infocode = json\_\['infocode'\]
if not infocode == INFOCODE\_OK:
if i == KEY\_POOL\_NUM\_INDICATOR:
sys.exit('NOInvalidKEY')
change\_key()
return
return
# 060101 购物服务 商场 购物中心
FILTER_GD_BUSINESS_AREA_TYPE_LIST = ['购物中心']
def fliter_gd_business_area_type(url):
global FILTER_GD_BUSINESS_AREA_TYPE_LIST
# {"suggestion":{"keywords":{},"cities":{}},"info":"OK","status":"1","count":"1","pois":[{"typecode":"050118","adname":"西城区","biz_type":"diner","id":"B0FFG8RYC7","pname":"北京市","importance":{},"location":"116.393715,39.957242","distance":{},"tel":"18600185618","biz_ext":{},"shopid":{},"address":"德胜街道六铺炕北小街8-1号","poiweight":{},"cityname":"北京市","type":"餐饮服务;中餐厅;特色\/地方风味餐厅","shopinfo":"0","name":"炙热青春"}],"infocode":"10000"
try:
r = requests.get(url)
r_json = r.json()
except Exception:
print(203, Exception)
# 返回数据解析json异常
return 3
infocode = r_json['infocode']
if infocode == '':
count = r_json['count']
if int(count) > 0:
pois_list = r_json['pois']
for l in pois_list:
type = l['type']
for chk_type in FILTER_GD_BUSINESS_AREA_TYPE_LIST:
if type.find(chk_type) > -1:
return 1
else:
change_key()
return 0
cater_dic = {}
cater_exception_list = []
count_catering = 0
count_catering_exception = 0
coffee_list = []
count_coffee = 0
fo = open('MEITUAN_DAZHONG_20170705 - 副本.csv', 'r')
file_line_list = []
for i in fo:
file_line_list.append(i)
fo.closed
file_line_list_len = len(file_line_list)
file_jump_step_num = 4000
count_catering_exception = 0
count_coffee = 0
count_catering = 0
def get_exception_logic_split_loop(nloop):
print(247, nloop)
global touse_key, cater_dic, file_line_list, file_line_list_len, file_jump_step_num, count_catering_exception, count_coffee, count_catering
start_line = nloop * file_jump_step_num
if start_line >= file_line_list_len:
print('last-line')
return
else:
start_line_count = 0
end_line = start_line + file_jump_step_num
if end_line >= file_line_list_len:
end_line = file_line_list_len - 1
for i in range(start_line, end_line, 1):
l_ = file_line_list[i].replace('\n', '').split(',')
city = l_[0]
district = l_[1]
address = l_[2]
name = l_[3]
average_price = l_[4]
catering_kind = l_[5]
data_from = 'mtdz_5'
# 数据准备层
# 数据运算层
# 该层处理从目标文件取出的字段列表
focus_list = [city, district, address, name, catering_kind, average_price, data_from]
dic_exception = {}
dic_exception['data_from'] = data_from
dic_exception['city'] = city
dic_exception['district'] = district
dic_exception['name'] = name
dic_exception['address'] = address
dic_exception['catering_kind'] = catering_kind
dic_exception['average_price'] = average_price
if not chk\_list\_thickness(focus\_list) or not chk\_city\_district(city) or not chk\_city\_district(
district) or not chk\_catering\_kind(catering\_kind):
count\_catering\_exception += 1
cater\_exception\_list.append(dic\_exception)
else:
name = get\_name(name)
m = chk\_is\_coffee(name)
# if m:
# print(list\_)
if not m:
m = chk\_is\_coffee(catering\_kind)
if m:
count\_coffee += 1
coffee\_list.append(dic\_exception)
if not m:
dic\_details = {}
dic\_details\['data\_from'\] = data\_from
dic\_details\['catering\_kind'\] = catering\_kind
dic\_details\['average\_price'\] = average\_price
if\_in\_business\_area = chk\_in\_business\_area(address)
if\_in\_business\_area\_criterion = 'str\_match'
if if\_in\_business\_area == 0:
city\_r = '&city=' + district
keywords = '&keywords=' + address + '|' + name
start\_line\_count += 1
print(417, start\_line, start\_line\_count)
if start\_line\_count % QPS == 0:
print('sleep')
sleep(1)
url = URL\_TYPE + '?' + 'key=' + touse\_key + RADIUS + keywords + city\_r + CITYLIMIT
if\_in\_business\_area = fliter\_gd\_business\_area\_type(url)
if\_in\_business\_area\_criterion = 'str\_match+request\_api'
dic\_details\['if\_in\_business\_area\_criterion'\] = if\_in\_business\_area\_criterion
dic\_details\['if\_in\_business\_area'\] = if\_in\_business\_area
if city not in cater\_dic:
cater\_dic\[city\] = {}
if district not in cater\_dic\[city\]:
cater\_dic\[city\]\[district\] = {}
if name not in cater\_dic\[city\]\[district\]:
cater\_dic\[city\]\[district\]\[name\] = {}
if address not in cater\_dic\[city\]\[district\]\[name\]:
cater\_dic\[city\]\[district\]\[name\]\[address\] = {}
cater\_dic\[city\]\[district\]\[name\]\[address\] = dic\_details
count\_catering += 1
class MyThread(threading.Thread):
def __init__(self, func, args, name=''):
threading.Thread.__init__(self)
self.name = name
self.func = func
self.args = args
def run(self):
self.func(self.args)
def main():
print('starting at:', ctime())
threads_list = []
thread_sum = math.ceil(file_line_list_len / file_jump_step_num)
print(thread_sum)
for nloop in range(1, thread_sum, 1):
print(nloop)
thread_instance = MyThread(get_exception_logic_split_loop, (nloop), get_exception_logic_split_loop.__name__)
print(353, '')
threads_list.append(thread_instance)
# 主进程将在所有非守护进程退出后,退出
for t in threads_list:
print(t)
t.setDaemon = False
t.start()
# wait for all thrades to finish
for t in threads_list:
t.join()
print(467, cater\_dic)
f\_name = 'ALL.csv'
f = open(f\_name, 'w', encoding='utf-8-sig')
f.write('')
f.closed
f = open(f\_name, 'a', encoding='utf-8-sig')
str = 'city, district, name, address, if\_in\_business\_area, if\_in\_business\_area\_criterion,catering\_kind, average\_price, data\_from\\n'
f.write(str)
## city,district,address,name,catering\_kind,average\_price,data\_from
count\_write\_rows = 0
for i in cater\_dic:
city = i
if city == '城市':
continue
for ii in cater\_dic\[i\]:
district = ii
for iii in cater\_dic\[i\]\[ii\]:
name = iii
for iv in cater\_dic\[i\]\[ii\]\[iii\]:
address = iv
catering\_kind = cater\_dic\[i\]\[ii\]\[iii\]\[iv\]\['catering\_kind'\]
average\_price = cater\_dic\[i\]\[ii\]\[iii\]\[iv\]\['average\_price'\]
if\_in\_business\_area = cater\_dic\[i\]\[ii\]\[iii\]\[iv\]\['if\_in\_business\_area'\]
if\_in\_business\_area\_criterion = cater\_dic\[i\]\[ii\]\[iii\]\[iv\]\['if\_in\_business\_area\_criterion'\]
data\_from = cater\_dic\[i\]\[ii\]\[iii\]\[iv\]\['data\_from'\]
str = '%s,%s,%s,%s,%s,%s,%s,%s,%s\\n' % (
city, district, name, address, if\_in\_business\_area, if\_in\_business\_area\_criterion,
catering\_kind, average\_price, data\_from)
f.write(str)
count\_write\_rows += 1
f.closed
print(count\_write\_rows)
if __name__ == '__main__':
main()
change_key_qps = 0
def change_key():
global touse_key, change_key_qps
change_key_qps += 1
if change_key_qps % QPS == 0:
sleep(TIME_UNIT)
# 高德没有遵守自己的QPS/日限策略;所不能通过其返回码,来控制key的使用;
mean_use_key = random.randint(0, KEY_POOL_NUM_INDICATOR)
print(mean_use_key)
for i in range(mean_use_key, KEY_POOL_NUM_INDICATOR, 1):
key = KEY_POOL_LIST[i]
if key == touse_key:
if i == KEY_POOL_NUM_INDICATOR:
change_key()
else:
continue
print(172, 'present_key', touse_key)
touse_key = key
url = URL_FOR_CHANGE_KEY % (touse_key)
print(175, 'touse_key', touse_key)
try:
r = requests.get(url)
try:
json_ = r.json()
except Exception:
print(' r.json()', Exception)
change_key()
except Exception:
print('requests.get(url)', Exception)
change_key()
infocode = json_['infocode']
if not infocode == INFOCODE_OK:
if i == KEY_POOL_NUM_INDICATOR:
sys.exit('NOInvalidKEY')
change_key()
高德没有遵守自己的QPS/日限策略;所不能通过其返回码,来控制key的使用;
2
172 present_key c5ef87ab7efe0d76b970fd330bf9e7f2
175 touse_key adf8e13d1b170fcef7132ea3178a2d6c
172 present_key adf8e13d1b170fcef7132ea3178a2d6c
175 touse_key 2f3d41dfbce352fc4d82009c552505fe
172 present_key 2f3d41dfbce352fc4d82009c552505fe
175 touse_key c5ef87ab7efe0d76b970fd330bf9e7f2
172 present_key c5ef87ab7efe0d76b970fd330bf9e7f2
175 touse_key 2f3d41dfbce352fc4d82009c552505fe
172 present_key 2f3d41dfbce352fc4d82009c552505fe
175 touse_key c5ef87ab7efe0d76b970fd330bf9e7f2
172 present_key c5ef87ab7efe0d76b970fd330bf9e7f2
175 touse_key 6d95ab3f63c494911002c1734089548a
6
172 present_key 6d95ab3f63c494911002c1734089548a
175 touse_key adf8e13d1b170fcef7132ea3178a2d6c
6
172 present_key adf8e13d1b170fcef7132ea3178a2d6c
175 touse_key 6d95ab3f63c494911002c1734089548a
6
172 present_key 6d95ab3f63c494911002c1734089548a
175 touse_key adf8e13d1b170fcef7132ea3178a2d6c
4
172 present_key adf8e13d1b170fcef7132ea3178a2d6c
175 touse_key c0d76e9fa950d0ff1761d56bd78a902e
def change_key():
global touse_key
mean_use_key = random.randint(0, KEY_POOL_NUM_INDICATOR)
print(mean_use_key)
for i in range(mean_use_key, KEY_POOL_NUM_INDICATOR, 1):
key = KEY_POOL_LIST[i]
if key == touse_key:
continue
print(172, 'present_key', touse_key)
touse_key = key
url = URL_FOR_CHANGE_KEY % (touse_key)
print(175, 'touse_key', touse_key)
try:
r = requests.get(url)
try :
json_ = r.json()
except Exception:
print(' r.json()',Exception)
change_key()
except Exception:
print('requests.get(url)',Exception)
change_key()
infocode = json_['infocode']
if not infocode == INFOCODE_OK:
if i == KEY_POOL_NUM_INDICATOR:
sys.exit('NOInvalidKEY')
change_key()
# -*- coding: UTF-8 -*-
import re
import pprint
import json
import time
import math
import sys
import requests
import threading
from time import ctime, sleep
ZHITONGZI_CITY_DIC = {}
f = open('直筒子市_东莞中山.txt', 'r', encoding='utf-8')
ZHITONGZI_CITY_DIC['东莞市'] = []
ZHITONGZI_CITY_DIC['中山市'] = []
c = 0
for i in f:
ii = i.split(';')
for iii in ii:
iv = iii.split('、')
if len(iv) > 2:
c += 1
for v in iv:
if v.find('(') > -1:
v_ = v.split('(')[1]
elif v.find(')') > -1:
v_ = v.split(')')[0]
else:
v_ = v
if c == 1 or c == 2:
ZHITONGZI_CITY_DIC['东莞市'].append(v_)
elif c == 3 or c == 4:
ZHITONGZI_CITY_DIC['中山市'].append(v_)
f.closed
def chk_is_coffee(str):
l_ = ['coffee', 'coffe', 'cafe', 'café', 'starbucks', 'caffé']
# 上岛花式铁板烧 日本菜
# 泛太平洋大酒店面馆 其他
l_b = ['咖啡', '星巴克']
# 星巴克
for i in l_:
if str.upper().find(i.upper()) != -1:
return True
for i in l_b:
if str.find(i) != -1:
return True
return False
def chk_kfc_mdl(str):
if str.find(u"麦当劳") != -1:
return 1
elif str.find(u"肯德基") != -1 or str.upper().find(u"KFC") != -1:
return 0
else:
return 2
def get_name(str):
if str.find("麦当劳") != -1:
return '麦当劳'
elif str.find("肯德基") != -1 or str.upper().find(u"KFC") != -1:
return '肯德基'
else:
# str = '狗不理包子(前门店)'
# str = '(清真)三羊水饺(新民路店)'
# | 添椒 | 潮涮三国IP火锅
if str.find('(') == -1 and str.find('(') == -1:
return str
res = str.strip(' ').split('(')[0].strip(' ')
if len(res) == 0:
try:
res = str.split(')')[1].split('(')[0]
except Exception:
print(Exception)
# 一锅两头牛(烟青路店)
res_b = res
try:
res_b = res.split('(')[0]
except Exception:
print(Exception)
return res\_b
def chk_city_district(str):
city_district = str.replace(' ', '')
if re.match(r".*[a-zA-Z0-9]", city_district) is not None:
return False
elif str.find("[") != -1 or str.find("(") != -1 or str.find(")") != -1:
return False
else:
return city_district
def chk_catering_kind(str):
catering_kind = str.replace(' ', '')
if re.match(r".*[0-9]", catering_kind) is not None:
return False
else:
return catering_kind
def chk_list_thickness(list_):
if len(list_) == 0:
return False
res_list = []
for i in list_:
i_b = i.replace(' ', '')
if i.replace(' ', '') == '':
return False
else:
res_list.append(i_b)
return res_list
business_area_tag_list = ['大厦', '大楼', '大厦', '百货', '购物中心', '商业中心', 'MALL', '广场', '商场', '单元', '栋', '座', '楼', '层', '底商']
def chk_in_business_area(str):
global business_area_tag_list
for i in business_area_tag_list:
if str.find(i) > -1:
return 1
return 0
MAX_PAGINATION = 100
pagination = MAX_PAGINATION
QPS = 50
TIME_UNIT = 1
INFOCODE_OK = ''
file_name_key_pool = 'key_pool.pool'
KEY_POOL_LIST = []
touse_key = ''
f = open(file_name_key_pool, 'r', encoding='utf-8')
for i in f:
try:
list_ = i.split('\t')
key = i.split('\t')[1].split()
KEY_POOL_LIST.append(key[0])
except Exception:
print(Exception)
KEY_POOL_NUM_INDICATOR = len(KEY_POOL_LIST) - 1
URL_TYPE = 'http://restapi.amap.com/v3/place/text'
touse_key = ''
RADIUS = '&radius=20'
keywords = '&keywords='
OFFSET = '&offset=10'
CITYLIMIT = '&citylimit=true'
URL_FOR_CHANGE_KEY = 'http://restapi.amap.com/v3/place/text?key=%s&types=060100&city=010&OFFSET=1'
def change_key():
global touse_key
for i in range(0, KEY_POOL_NUM_INDICATOR, 1):
key = KEY_POOL_LIST[i]
if key == touse_key:
continue
touse_key = key
url = URL_FOR_CHANGE_KEY % (touse_key)
print(62, 'chk_key', url)
print(62, 'touse_key', touse_key)
try:
r = requests.get(url)
except Exception:
print(Exception)
change_key()
json_ = r.json()
infocode = json_['infocode']
if not infocode == INFOCODE_OK:
if i == KEY_POOL_NUM_INDICATOR:
sys.exit('NOInvalidKEY')
change_key()
FILTER_GD_BUSINESS_AREA_TYPE_LIST = ['购物中心']
def fliter_gd_business_area_type(url):
global FILTER_GD_BUSINESS_AREA_TYPE_LIST
# {"suggestion":{"keywords":{},"cities":{}},"info":"OK","status":"1","count":"1","pois":[{"typecode":"050118","adname":"西城区","biz_type":"diner","id":"B0FFG8RYC7","pname":"北京市","importance":{},"location":"116.393715,39.957242","distance":{},"tel":"18600185618","biz_ext":{},"shopid":{},"address":"德胜街道六铺炕北小街8-1号","poiweight":{},"cityname":"北京市","type":"餐饮服务;中餐厅;特色\/地方风味餐厅","shopinfo":"0","name":"炙热青春"}],"infocode":"10000"
try:
r = requests.get(url)
r_json = r.json()
except Exception:
print(Exception)
print(195, url)
return 0
infocode = r_json['infocode']
if infocode == '':
count = r_json['count']
if int(count) > 0:
pois_list = r_json['pois']
for l in pois_list:
type = l['type']
for chk_type in FILTER_GD_BUSINESS_AREA_TYPE_LIST:
if type.find(chk_type) > -1:
return 1
else:
change_key()
return 0
cater_dic = {}
cater_exception_list = []
count_catering = 0
count_catering_exception = 0
coffee_list = []
count_coffee = 0
fo = open('MEITUAN_DAZHONG_20170705 - 副本.csv', 'r')
file_line_list = []
for i in fo:
file_line_list.append(i)
fo.closed
file_line_list_len = len(file_line_list)
file_jump_step_num = 10000
count_catering_exception = 0
count_coffee = 0
count_catering = 0
def get_exception_logic_split_loop(nloop):
print(247,nloop)
global touse_key,cater_dic, file_line_list, file_line_list_len, file_jump_step_num, count_catering_exception, count_coffee, count_catering
start_line = nloop * file_jump_step_num
if start_line >= file_line_list_len:
print('last-line')
return
else:
start_line_count = 0
end_line = start_line + file_jump_step_num
if end_line >= file_line_list_len:
end_line = file_line_list_len - 1
for i in range(start_line, end_line, 1):
l_ = file_line_list[i].replace('\n', '').split(',')
city = l_[0]
district = l_[1]
address = l_[2]
name = l_[3]
average_price = l_[4]
catering_kind = l_[5]
data_from = 'mtdz_5'
# 数据准备层
# 数据运算层
# 该层处理从目标文件取出的字段列表
focus_list = [city, district, address, name, catering_kind, average_price, data_from]
dic_exception = {}
dic_exception['data_from'] = data_from
dic_exception['city'] = city
dic_exception['district'] = district
dic_exception['name'] = name
dic_exception['address'] = address
dic_exception['catering_kind'] = catering_kind
dic_exception['average_price'] = average_price
if not chk\_list\_thickness(focus\_list) or not chk\_city\_district(city) or not chk\_city\_district(
district) or not chk\_catering\_kind(catering\_kind):
count\_catering\_exception += 1
cater\_exception\_list.append(dic\_exception)
else:
name = get\_name(name)
m = chk\_is\_coffee(name)
# if m:
# print(list\_)
if not m:
m = chk\_is\_coffee(catering\_kind)
if m:
count\_coffee += 1
coffee\_list.append(dic\_exception)
if not m:
dic\_details = {}
dic\_details\['data\_from'\] = data\_from
dic\_details\['catering\_kind'\] = catering\_kind
dic\_details\['average\_price'\] = average\_price
if\_in\_business\_area = chk\_in\_business\_area(address)
if\_in\_business\_area\_criterion = 'str\_match'
if if\_in\_business\_area == 0:
city\_r = '&city=' + district
keywords = '&keywords=' + address + '|' + name
start\_line\_count += 1
print(417, start\_line,start\_line\_count)
if start\_line\_count%QPS == 0:
print('sleep')
sleep(1)
url = URL\_TYPE + '?' + 'key=' + touse\_key + RADIUS + keywords + city\_r + CITYLIMIT
if\_in\_business\_area = fliter\_gd\_business\_area\_type(url)
if\_in\_business\_area\_criterion = 'str\_match+request\_api'
dic\_details\['if\_in\_business\_area\_criterion'\] = if\_in\_business\_area\_criterion
dic\_details\['if\_in\_business\_area'\] = if\_in\_business\_area
if city not in cater\_dic:
cater\_dic\[city\] = {}
if district not in cater\_dic\[city\]:
cater\_dic\[city\]\[district\] = {}
if name not in cater\_dic\[city\]\[district\]:
cater\_dic\[city\]\[district\]\[name\] = {}
if address not in cater\_dic\[city\]\[district\]\[name\]:
cater\_dic\[city\]\[district\]\[name\]\[address\] = {}
cater\_dic\[city\]\[district\]\[name\]\[address\] = dic\_details
count\_catering += 1
class MyThread(threading.Thread):
def __init__(self,func,args,name=''):
threading.Thread.__init__(self)
self.name = name
self.func = func
self.args = args
def run(self):
self.func(self.args)
def main():
print('starting at:',ctime())
threads_list = []
thread_sum = math.ceil(file_line_list_len / file_jump_step_num)
print(thread_sum)
for nloop in range(1, thread_sum, 1):
print(nloop)
thread_instance = MyThread(get_exception_logic_split_loop,(nloop),get_exception_logic_split_loop.__name__)
print(353,'')
threads_list.append(thread_instance)
# 主进程将在所有非守护进程退出后,退出
for t in threads_list:
print(t)
t.setDaemon = False
t.start()
# wait for all thrades to finish
for t in threads_list:
t.join()
print(467, cater\_dic)
f\_name = 'ALL.csv'
f = open(f\_name, 'w', encoding='utf-8-sig')
f.write('')
f.closed
f = open(f\_name, 'a', encoding='utf-8-sig')
str = 'city, district, name, address, if\_in\_business\_area, if\_in\_business\_area\_criterion,catering\_kind, average\_price, data\_from\\n'
f.write(str)
## city,district,address,name,catering\_kind,average\_price,data\_from
count\_write\_rows = 0
for i in cater\_dic:
city = i
if city == '城市':
continue
for ii in cater\_dic\[i\]:
district = ii
for iii in cater\_dic\[i\]\[ii\]:
name = iii
for iv in cater\_dic\[i\]\[ii\]\[iii\]:
address = iv
catering\_kind = cater\_dic\[i\]\[ii\]\[iii\]\[iv\]\['catering\_kind'\]
average\_price = cater\_dic\[i\]\[ii\]\[iii\]\[iv\]\['average\_price'\]
if\_in\_business\_area = cater\_dic\[i\]\[ii\]\[iii\]\[iv\]\['if\_in\_business\_area'\]
if\_in\_business\_area\_criterion = cater\_dic\[i\]\[ii\]\[iii\]\[iv\]\['if\_in\_business\_area\_criterion'\]
data\_from = cater\_dic\[i\]\[ii\]\[iii\]\[iv\]\['data\_from'\]
str = '%s,%s,%s,%s,%s,%s,%s,%s,%s\\n' % (
city, district, name, address, if\_in\_business\_area, if\_in\_business\_area\_criterion,
catering\_kind, average\_price, data\_from)
f.write(str)
count\_write\_rows += 1
f.closed
print(count\_write\_rows)
if __name__ == '__main__':
main()
while fo.readline() 少统计了??
# -*- coding: UTF-8 -*-
import re
import pprint
import json
import time
import math
import sys
import requests
import threading
from time import ctime, sleep
ZHITONGZI_CITY_DIC = {}
f = open('直筒子市_东莞中山.txt', 'r', encoding='utf-8')
ZHITONGZI_CITY_DIC['东莞市'] = []
ZHITONGZI_CITY_DIC['中山市'] = []
c = 0
for i in f:
ii = i.split(';')
for iii in ii:
iv = iii.split('、')
if len(iv) > 2:
c += 1
for v in iv:
if v.find('(') > -1:
v_ = v.split('(')[1]
elif v.find(')') > -1:
v_ = v.split(')')[0]
else:
v_ = v
if c == 1 or c == 2:
ZHITONGZI_CITY_DIC['东莞市'].append(v_)
elif c == 3 or c == 4:
ZHITONGZI_CITY_DIC['中山市'].append(v_)
f.closed
def chk_is_coffee(str):
l_ = ['coffee', 'coffe', 'cafe', 'café', 'starbucks', 'caffé']
# 上岛花式铁板烧 日本菜
# 泛太平洋大酒店面馆 其他
l_b = ['咖啡', '星巴克']
# 星巴克
for i in l_:
if str.upper().find(i.upper()) != -1:
return True
for i in l_b:
if str.find(i) != -1:
return True
return False
def chk_kfc_mdl(str):
if str.find(u"麦当劳") != -1:
return 1
elif str.find(u"肯德基") != -1 or str.upper().find(u"KFC") != -1:
return 0
else:
return 2
def get_name(str):
if str.find("麦当劳") != -1:
return '麦当劳'
elif str.find("肯德基") != -1 or str.upper().find(u"KFC") != -1:
return '肯德基'
else:
# str = '狗不理包子(前门店)'
# str = '(清真)三羊水饺(新民路店)'
# | 添椒 | 潮涮三国IP火锅
if str.find('(') == -1 and str.find('(') == -1:
return str
res = str.strip(' ').split('(')[0].strip(' ')
if len(res) == 0:
try:
res = str.split(')')[1].split('(')[0]
except Exception:
print(Exception)
# 一锅两头牛(烟青路店)
res_b = res
try:
res_b = res.split('(')[0]
except Exception:
print(Exception)
return res\_b
def chk_city_district(str):
city_district = str.replace(' ', '')
if re.match(r".*[a-zA-Z0-9]", city_district) is not None:
return False
elif str.find("[") != -1 or str.find("(") != -1 or str.find(")") != -1:
return False
else:
return city_district
def chk_catering_kind(str):
catering_kind = str.replace(' ', '')
if re.match(r".*[0-9]", catering_kind) is not None:
return False
else:
return catering_kind
def chk_list_thickness(list_):
if len(list_) == 0:
return False
res_list = []
for i in list_:
i_b = i.replace(' ', '')
if i.replace(' ', '') == '':
return False
else:
res_list.append(i_b)
return res_list
business_area_tag_list = ['大厦', '大楼', '大厦', '百货', '购物中心', '商业中心', 'MALL', '广场', '商场', '单元', '栋', '座', '楼', '层', '底商']
def chk_in_business_area(str):
global business_area_tag_list
for i in business_area_tag_list:
if str.find(i) > -1:
return 1
return 0
MAX_PAGINATION = 100
pagination = MAX_PAGINATION
QPS = 50
TIME_UNIT = 1
INFOCODE_OK = '10000'
file_name_key_pool = 'key_pool.pool'
KEY_POOL_LIST = []
touse_key = ''
f = open(file_name_key_pool, 'r', encoding='utf-8')
for i in f:
try:
list_ = i.split('\t')
key = i.split('\t')[1].split()
KEY_POOL_LIST.append(key[0])
except Exception:
print(Exception)
KEY_POOL_NUM_INDICATOR = len(KEY_POOL_LIST) - 1
URL_TYPE = 'http://restapi.amap.com/v3/place/text'
touse_key = ''
RADIUS = '&radius=20'
keywords = '&keywords='
OFFSET = '&offset=10'
CITYLIMIT = '&citylimit=true'
URL_FOR_CHANGE_KEY = 'http://restapi.amap.com/v3/place/text?key=%s&types=060100&city=010&OFFSET=1'
def change_key():
global touse_key
for i in range(0, KEY_POOL_NUM_INDICATOR, 1):
key = KEY_POOL_LIST[i]
if key == touse_key:
continue
touse_key = key
url = URL_FOR_CHANGE_KEY % (touse_key)
print(62, 'chk_key', url)
print(62, 'touse_key', touse_key)
try:
r = requests.get(url)
except Exception:
print(Exception)
change_key()
json_ = r.json()
infocode = json_['infocode']
if not infocode == INFOCODE_OK:
if i == KEY_POOL_NUM_INDICATOR:
sys.exit('NOInvalidKEY')
change_key()
FILTER_GD_BUSINESS_AREA_TYPE_LIST = ['购物中心']
def fliter_gd_business_area_type(url):
global FILTER_GD_BUSINESS_AREA_TYPE_LIST
# {"suggestion":{"keywords":{},"cities":{}},"info":"OK","status":"1","count":"1","pois":[{"typecode":"050118","adname":"西城区","biz_type":"diner","id":"B0FFG8RYC7","pname":"北京市","importance":{},"location":"116.393715,39.957242","distance":{},"tel":"18600185618","biz_ext":{},"shopid":{},"address":"德胜街道六铺炕北小街8-1号","poiweight":{},"cityname":"北京市","type":"餐饮服务;中餐厅;特色\/地方风味餐厅","shopinfo":"0","name":"炙热青春"}],"infocode":"10000"
try:
r = requests.get(url)
r_json = r.json()
except Exception:
print(Exception)
print(195, url)
return 0
infocode = r_json['infocode']
if infocode == '10000':
count = r_json['count']
if int(count) > 0:
pois_list = r_json['pois']
for l in pois_list:
type = l['type']
for chk_type in FILTER_GD_BUSINESS_AREA_TYPE_LIST:
if type.find(chk_type) > -1:
return 1
else:
change_key()
return 0
cater_dic = {}
cater_exception_list = []
count_catering = 0
count_catering_exception = 0
coffee_list = []
count_coffee = 0
fo = open('MEITUAN_DAZHONG_20170705 - 副本.csv', 'r', encoding='gbk')
file_line_list = []
while fo.readline():
file_line_list.append(fo.readline())
file_line_list_len = len(file_line_list)
file_jump_step_num = 5000
count_catering_exception = 0
count_coffee = 0
count_catering = 0
def get_exception_logic_split_loop(nloop):
print(247,nloop)
global touse_key,cater_dic, file_line_list, file_line_list_len, file_jump_step_num, count_catering_exception, count_coffee, count_catering
start_line = nloop * file_jump_step_num
if start_line >= file_line_list_len:
print('last-line')
return
else:
start_line_count = 0
end_line = start_line + file_jump_step_num
if end_line >= file_line_list_len:
end_line = file_line_list_len - 1
for i in range(start_line, end_line, 1):
l_ = file_line_list[i].replace('\n', '').split(',')
city = l_[0]
district = l_[1]
address = l_[2]
name = l_[3]
average_price = l_[4]
catering_kind = l_[5]
data_from = 'mtdz_5'
# 数据准备层
# 数据运算层
# 该层处理从目标文件取出的字段列表
focus_list = [city, district, address, name, catering_kind, average_price, data_from]
dic_exception = {}
dic_exception['data_from'] = data_from
dic_exception['city'] = city
dic_exception['district'] = district
dic_exception['name'] = name
dic_exception['address'] = address
dic_exception['catering_kind'] = catering_kind
dic_exception['average_price'] = average_price
if not chk\_list\_thickness(focus\_list) or not chk\_city\_district(city) or not chk\_city\_district(
district) or not chk\_catering\_kind(catering\_kind):
count\_catering\_exception += 1
cater\_exception\_list.append(dic\_exception)
else:
name = get\_name(name)
m = chk\_is\_coffee(name)
# if m:
# print(list\_)
if not m:
m = chk\_is\_coffee(catering\_kind)
if m:
count\_coffee += 1
coffee\_list.append(dic\_exception)
if not m:
dic\_details = {}
dic\_details\['data\_from'\] = data\_from
dic\_details\['catering\_kind'\] = catering\_kind
dic\_details\['average\_price'\] = average\_price
if\_in\_business\_area = chk\_in\_business\_area(address)
if\_in\_business\_area\_criterion = 'str\_match'
if if\_in\_business\_area == 0:
city\_r = '&city=' + district
keywords = '&keywords=' + address + '|' + name
start\_line\_count += 1
print(417, start\_line,start\_line\_count)
if start\_line\_count%QPS == 0:
print('sleep')
sleep(1)
url = URL\_TYPE + '?' + 'key=' + touse\_key + RADIUS + keywords + city\_r + CITYLIMIT
if\_in\_business\_area = fliter\_gd\_business\_area\_type(url)
if if\_in\_business\_area == 1:
if\_in\_business\_area\_criterion = 'request\_api'
dic\_details\['if\_in\_business\_area\_criterion'\] = if\_in\_business\_area\_criterion
dic\_details\['if\_in\_business\_area'\] = if\_in\_business\_area
if city not in cater\_dic:
cater\_dic\[city\] = {}
if district not in cater\_dic\[city\]:
cater\_dic\[city\]\[district\] = {}
if name not in cater\_dic\[city\]\[district\]:
cater\_dic\[city\]\[district\]\[name\] = {}
if address not in cater\_dic\[city\]\[district\]\[name\]:
cater\_dic\[city\]\[district\]\[name\]\[address\] = {}
cater\_dic\[city\]\[district\]\[name\]\[address\] = dic\_details
count\_catering += 1
class MyThread(threading.Thread):
def __init__(self,func,args,name=''):
threading.Thread.__init__(self)
self.name = name
self.func = func
self.args = args
def run(self):
self.func(self.args)
def main():
print('starting at:',ctime())
threads_list = []
thread_sum = math.floor(file_line_list_len / file_jump_step_num)
print(thread_sum)
for nloop in range(1, thread_sum, 1):
print(nloop)
thread_instance = MyThread(get_exception_logic_split_loop,(nloop),get_exception_logic_split_loop.__name__)
print(353,'123')
threads_list.append(thread_instance)
# 主进程将在所有非守护进程退出后,退出
for t in threads_list:
print(t)
t.setDaemon = False
t.start()
# wait for all thrades to finish
for t in threads_list:
t.join()
print(467, cater\_dic)
f\_name = 'ALL.csv'
f = open(f\_name, 'w', encoding='utf-8-sig')
f.write('')
f.closed
f = open(f\_name, 'a', encoding='utf-8-sig')
str = '市,区,品牌名,地址,是否在商场,菜别(类型),均价,data\_from,\\n'
f.write(str)
## city,district,address,name,catering\_kind,average\_price,data\_from
count\_write\_rows = 0
for i in cater\_dic:
city = i
if city == '城市':
continue
for ii in cater\_dic\[i\]:
district = ii
for iii in cater\_dic\[i\]\[ii\]:
name = iii
for iv in cater\_dic\[i\]\[ii\]\[iii\]:
address = iv
catering\_kind = cater\_dic\[i\]\[ii\]\[iii\]\[iv\]\['catering\_kind'\]
average\_price = cater\_dic\[i\]\[ii\]\[iii\]\[iv\]\['average\_price'\]
if\_in\_business\_area = cater\_dic\[i\]\[ii\]\[iii\]\[iv\]\['if\_in\_business\_area'\]
if\_in\_business\_area\_criterion = cater\_dic\[i\]\[ii\]\[iii\]\[iv\]\['if\_in\_business\_area\_criterion'\]
data\_from = cater\_dic\[i\]\[ii\]\[iii\]\[iv\]\['data\_from'\]
str = '%s,%s,%s,%s,%s,%s,%s,%s,%s\\n' % (
city, district, name, address, if\_in\_business\_area, if\_in\_business\_area\_criterion,
catering\_kind, average\_price, data\_from)
f.write(str)
count\_write\_rows += 1
f.closed
print(count\_write\_rows)
if __name__ == '__main__':
main()
手机扫一扫
移动阅读更方便
你可能感兴趣的文章