首页 页面

2019-07-13至2019-07-22实习培训(Python)

附python3操作MySQL(增删改查、事务)

import requests
import json
import re
import time
import _thread
import random

# Extracts every run of digits from an income label such as "2000~3000".
# Written as a raw string: '\d' inside a plain string literal is an invalid
# escape sequence and triggers a warning on current Python versions.
income_filter = re.compile(r'\d+')

# Header row of the output file; columns are separated by '::' and the row
# ends with a newline, matching the layout of the data rows.
table_head = '::'.join([
    '会员ID', '昵称', '年龄', '省市', '市区县',
    '星座', '学历', '学历转分数', '工资', '婚否',
    '身高', '体重', '职业', '性别', '头像',
]) + '\n'
# The header row is not written by default.
write_head = False

# # GET参数
# params = {
#     'ajax': 1,
#     'ageBegin': 0, # default
#     'ageEnd': 100, # default
#     'aim': -1,
#     'marriage': 0,
#     'mode': 4,
#     'order': 8,
#     'province': 0,
#     'city': 0,
#     'district': -1,
#     'sex': 0,
#     'userTag': 0,
#     'vippage': -1,
#     'searchType': 0,
#     'page': 1,
#     'pagesize': 81
# }

# http://yuehui.163.com/searchusersrcm.do?ajax=1&ageBegin=18&ageEnd=25&aim=-1&marriage=0&mode=4&order=8&province=0&\
# city=0&district=-1&sex=0&userTag=0&vippage=-1&searchType=0&page=1&pagesize=81


# Simulate a randomly chosen browser
def random_user_agent():
    """Return one User-Agent header value picked at random from a fixed pool.

    The pool holds ten entries; the Chrome OS entry appears twice, so it is
    selected twice as often as each of the others.
    """
    agent_pool = (
        'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.16 Safari/537.36',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.3319.102 Safari/537.36',
        'Mozilla/5.0 (X11; CrOS i686 3912.101.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36',
        'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:17.0) Gecko/20100101 Firefox/17.0.6',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36',
        'Mozilla/5.0 (X11; CrOS i686 3912.101.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36',
    )
    chosen = random.choice(agent_pool)
    return chosen


# Fetch one result page for a fixed set of search parameters and store it.

# Seconds to wait on the server before a page request is abandoned; without
# a timeout a stalled connection would block its worker thread forever.
_REQUEST_TIMEOUT_SECONDS = 10

# Score written next to each education label; labels not listed score 0.
_DEGREE_SCORES = {
    '高中以下': 30,
    '空': 30,
    '高中': 40,
    '专科': 50,
    '本科': 60,
    '硕士': 70,
    '博士': 80,
}


def _text_or_placeholder(value):
    """Return value unchanged, or the placeholder '空' when it is ''."""
    return value if value != '' else '空'


def _income_to_number(income_name):
    """Convert an income label into a single integer.

    Labels seen by the original author look like "2000 以下" or "2000~3000".
    Returns -1 when the income is missing (such records are meant to be left
    out of sorting), 0 when it is marked '保密', the number itself when the
    label holds one number, and the floor of the mean of the first two
    numbers otherwise.
    """
    label = _text_or_placeholder(income_name)
    if label == '空':
        return -1
    if label == '保密':
        return 0
    numbers = [int(digits) for digits in income_filter.findall(label)]
    if not numbers:
        # A label without any digits cannot be parsed; treat it like a
        # missing income instead of failing the whole page.
        return -1
    if len(numbers) == 1:
        return numbers[0]
    return (numbers[0] + numbers[1]) // 2


def _format_member_row(item, func_sex):
    """Build one '::'-separated output line (with trailing newline) for item.

    The column order matches table_head: id, nick, age, city, district,
    constellation, degree, degree score, income, marriage, stature, weight,
    industry, sex, photo URI.
    """
    degree = _text_or_placeholder(item['degreeName'])
    columns = [
        '%d' % item['id'],
        _text_or_placeholder(item['nick']),
        '%d' % item['age'],
        _text_or_placeholder(item['cityName']),
        _text_or_placeholder(item['districtName']),
        _text_or_placeholder(item['constellationName']),
        degree,
        '%d' % _DEGREE_SCORES.get(degree, 0),
        '%d' % _income_to_number(item['incomeName']),
        '%d' % item['marriage'],
        '%d' % item['stature'],
        '%d' % item['avoirdupois'],
        _text_or_placeholder(item['industryName']),
        '%d' % func_sex,
        _text_or_placeholder(item['fullPhotoUri']),
    ]
    return '::'.join(columns) + '\n'


def save_a_page_info_by_params(func_url, func_province, func_marriage, func_page, func_sex):
    """Request one search-result page and append its members to ./data/girls.

    Args:
        func_url: URL of the search endpoint.
        func_province: value sent as the 'province' query parameter.
        func_marriage: value sent as the 'marriage' query parameter.
        func_page: value sent as the 'page' query parameter.
        func_sex: value sent as the 'sex' query parameter; it is also written
            into the sex column of every stored row.

    The response is decoded as UTF-8 JSON. When the first element's 'total'
    is below 81 (the requested page size) the page is treated as "no more
    info" and nothing is written. Otherwise one line per entry of 'list' is
    appended to the output file.

    Every exception is caught and reported with print(); the function always
    returns None, so a failing page never raises into the calling thread.
    """
    # GET parameters
    func_params = {
        'ajax': 1,
        'ageBegin': 0,  # default
        'ageEnd': 100,  # default
        'aim': -1,
        'marriage': func_marriage,
        'mode': 4,
        'order': 8,
        'province': func_province,
        'city': 0,
        'district': -1,
        'sex': func_sex,
        'userTag': 0,
        'vippage': -1,
        'searchType': 0,
        'page': func_page,
        'pagesize': 81
    }

    # Browser-like headers with a randomly chosen User-Agent and no cookie
    headers = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Cookie': '',
        'Host': 'yuehui.163.com',
        'Pragma': 'no-cache',
        'Referer': 'http://yuehui.163.com/searchusers.do?sex=0&province=0&city=0&ageBegin=18&ageEnd=25',
        'User-Agent': random_user_agent()
    }

    try:
        res = requests.get(url=func_url, params=func_params, headers=headers,
                           timeout=_REQUEST_TIMEOUT_SECONDS).content.decode('utf-8')

        obj = json.loads(res)

        if obj[0]['total'] < 81:
            time.sleep(0.5)
            print('Page Thread get msg: no more info, stop request')
        else:
            # NOTE(review): no lock is taken around the file although this
            # function runs in many threads at once; confirm that interleaved
            # appends are acceptable for the output.
            with open('./data/girls', 'a+', encoding='utf-8') as fd:
                if write_head:
                    fd.write(table_head)
                # Rows are produced lazily, so rows formatted before a bad
                # record are still written, as with a row-by-row loop.
                fd.writelines(_format_member_row(item, func_sex) for item in obj[0]['list'])
            time.sleep(0.5)
            print('Page Thread run done')
    except Exception as e:
        # str(e) is required: concatenating the exception object itself to a
        # str raises TypeError inside the handler.
        print('[Fail to request]page thread requests Fail: ' + str(e))


# Script body: start one worker thread per (sex, province, marriage, page)
# combination, then keep the main thread alive.
url = 'http://yuehui.163.com/searchusersrcm.do'

for sex in range(2):                        # sex values 0 and 1
    for province in range(34):              # province values 0..33
        for marriage in range(2):           # marriage values 0 and 1
            for page in range(1, 15):       # pages 1..14
                # Each thread receives its own parameter values as arguments;
                # a shared params dict would be overwritten by concurrent
                # iterations before the thread reads it.
                try:
                    _thread.start_new_thread(save_a_page_info_by_params, (url, province, marriage, page, sex,))
                except _thread.error:
                    print("[sex=%d, province=%d, marriage=%d, page=%d] 无法启动线程" % (sex, province, marriage, page))
                else:
                    # Report success only when the thread really started, and
                    # with the page number that was actually requested.
                    print('Thread start OK for [sex=%d, province=%d, marriage=%d, page=%d]' % (sex, province, marriage, page))

# The main thread idles forever, presumably so the process stays alive while
# the worker threads finish; the script has to be stopped manually.
while True:
    time.sleep(30)



# 会员ID::昵称::年龄::省市::市区县::星座::学历::学历转分数::工资::婚否::身高::体重::职业::性别::头像

评论

Top