import time

import requests
from lxml import etree

# Request headers carrying a desktop-browser User-Agent; passed as
# ``headers=`` to the ``requests.get`` calls in this file.
header = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.69"
}
def get_detail(url, timeout=10):
    """Fetch a listing's detail page and return three ownership attributes.

    The values are read from the table inside the ``houseInfo`` element:

    * property-rights type  -- row 1, cell 2
    * property-rights term  -- row 2, cell 2
    * deed age              -- row 2, cell 3

    Args:
        url: Address of the listing detail page.
        timeout: Seconds to wait for the server; passed to ``requests.get``
            so a stalled connection cannot block the crawl forever.

    Returns:
        The three values joined by commas, in the order listed above.  A
        value that is not found on the page is rendered as the text
        ``None``.

    Network errors raised by ``requests`` are not caught here.
    """
    # NOTE(review): these paths contain ``tbody``; confirm the served HTML
    # really includes that element (browsers add it to the DOM, parsers of
    # the raw markup do not).
    field_paths = (
        '//*[@id="houseInfo"]/table/tbody/tr[1]/td[2]/span[2]/text()',
        '//*[@id="houseInfo"]/table/tbody/tr[2]/td[2]/span[2]/text()',
        '//*[@id="houseInfo"]/table/tbody/tr[2]/td[3]/span[2]/text()',
    )

    time.sleep(0.3)  # short pause before every detail request
    content = requests.get(url=url, headers=header, timeout=timeout).text
    tree = etree.HTML(content)

    values = []
    for path in field_paths:
        # A missing tree or an empty match list both mean "field absent".
        matches = tree.xpath(path) if tree is not None else []
        values.append(matches[0] if matches else None)

    return ','.join(str(value) for value in values)
def data_cat(root):
    """Concatenate the text of the direct ``span`` children of *root*.

    Args:
        root: An object exposing ``xpath`` (the callers in this file pass
            elements obtained from an lxml tree).

    Returns:
        The strings returned by ``root.xpath('./span/text()')`` joined
        together with no separator, or ``''`` when there are none.
    """
    return ''.join(root.xpath('./span/text()'))
def get_data(i, fp, timeout=10):
    """Scrape one results page and write one comma-separated row per listing.

    Each row holds, in order: title, total price, room layout, area,
    orientation, floor, year built, address, contact person, and the three
    comma-separated values returned by ``get_detail`` for the listing link.

    Args:
        i: Page number interpolated into the results-page URL.
        fp: Writable text file object that receives the rows.
        timeout: Seconds to wait for the results page; passed to
            ``requests.get`` so a stalled connection cannot hang the crawl.

    Network errors raised by ``requests`` are not caught here.
    """
    print(f'=======正在爬取第{i}页=========')
    time.sleep(1.2)  # pause before every results-page request
    url = f'https://rizhao.anjuke.com/sale/p{i}/?from=HomePage_TopBar'
    contents = requests.get(url=url, headers=header, timeout=timeout).text
    tree = etree.HTML(contents)
    if tree is None:
        # Nothing parseable came back, so there are no rows to write.
        return

    # One page-wide query per column.
    titles = tree.xpath('//h3[@class="property-content-title-name"]/text()')  # title
    prices = tree.xpath('//p[@class="property-price-total"]/span[1]/text()')  # price
    # Room layout: kept as elements, text is assembled by data_cat().
    rooms = tree.xpath('//p[@class="property-content-info-text property-content-info-attribute"]')
    areas = tree.xpath('//p[@class="property-content-info-text"][1]/text()')  # area
    forward = tree.xpath('//p[@class="property-content-info-text"][2]/text()')  # orientation
    floor = tree.xpath('//p[@class="property-content-info-text"][3]/text()')  # floor
    years = tree.xpath('//p[@class="property-content-info-text"][4]/text()')  # year built
    # Address: kept as elements, text is assembled by data_cat().
    address = tree.xpath('//p[@class="property-content-info-comm-address"]')
    persons = tree.xpath('//span[@class="property-extra-text"][1]/text()')  # contact person
    links = tree.xpath('//div[@class="property"]/a/@href')  # detail-page link

    columns = (titles, prices, rooms, areas, forward, floor, years,
               address, persons, links)
    if len({len(column) for column in columns}) > 1:
        # The columns are matched up purely by position; unequal lengths
        # mean some listing lacks a field.  zip() below stops at the
        # shortest column instead of failing with IndexError mid-page.
        print(f'warning: page {i} returned columns of unequal length; extra entries are skipped')

    for idx, row in enumerate(zip(*columns)):
        (title, price, room, area, facing, level, year,
         addr, person, link) = row
        print(f'第{idx}个')
        # Fetch the detail page exactly once per listing.
        detail = get_detail(link.strip())
        fields = (
            title.strip(),
            price.strip(),
            data_cat(room).strip(),
            area.strip(),
            facing.strip(),
            level.strip(),
            year.strip(),
            data_cat(addr).strip(),
            person.strip(),
            detail,
        )
        print(','.join(fields), file=fp)


if __name__ == '__main__':
    # Append the rows for pages 1 through 10 to rizhao.txt.
    with open('rizhao.txt', 'a+', encoding='utf-8') as fp:
        for page in range(1, 11):
            get_data(page, fp)