# -*- coding: utf-8 -*-
import scrapy
import re
from fang.items import NewHouseItem, ESFItem


class SfwSpider(scrapy.Spider):
    """Spider that collects new-house and second-hand-house listings.

    The crawl starts at the city index page in ``start_urls``. For every
    city link found there, ``parse`` schedules one request for the city's
    new-house listing (handled by ``parse_newhouse``) and one for its
    second-hand listing (handled by ``parse_esf``). Both callbacks yield
    items and follow their "next page" link.
    """

    name = 'sfw'
    allowed_domains = ['fang.com']
    start_urls = ['https://www.fang.com/SoufunFamily.htm']

    @staticmethod
    def _strip_ws(text):
        """Return ``text`` with all whitespace removed (``None`` -> ``''``)."""
        return re.sub(r'\s', '', text or '')

    @staticmethod
    def _listing_urls(city, city_url):
        """Build the listing URLs for one city link.

        Args:
            city: Link text of the city anchor (may be ``None``).
            city_url: ``href`` of the city anchor (may be ``None``).

        Returns:
            A ``(newhouse_url, esf_url)`` tuple, or ``None`` when no city
            identifier can be extracted from ``city_url``.
        """
        if city == '北京':
            # Beijing uses fixed hosts that carry no city prefix.
            return ('https://newhouse.fang.com/house/s/',
                    'https://esf1.fang.com')

        # The city identifier is the first host label of the link,
        # i.e. the text between '//' and the first '.'.
        parts = (city_url or '').split('//')
        if len(parts) < 2:
            return None
        city_id = parts[1].split('.')[0]
        if not city_id:
            return None
        return ('https://' + city_id + '.newhouse.fang.com/house/s/',
                'https://' + city_id + '.esf.fang.com')

    def parse(self, response):
        """Walk the city index table and schedule the per-city listings.

        Yields two ``scrapy.Request`` objects per city: one for the
        new-house listing and one for the second-hand listing. Each request
        carries ``meta['info'] = (province, city)``.
        """
        province = None
        for tr in response.xpath('//div[@class="outCont"]//tr'):
            tds = tr.xpath('.//td[not(@class)]')
            if len(tds) < 2:
                # Not a province/cities row; nothing to extract.
                continue

            province_text = self._strip_ws(tds[0].xpath('.//text()').get())
            # An empty province cell means the row continues the province
            # of the previous row, so only overwrite on non-empty text.
            if province_text:
                province = province_text
            # Skip the '其它' group, including any continuation rows that
            # would otherwise be attributed to the preceding province.
            if province == '其它':
                continue

            # NOTE: the original ended both loops with an unconditional
            # ``break`` (debug leftover), so only the first city of the
            # first row was ever crawled. All rows and cities are visited.
            for city_link in tds[1].xpath('.//a'):
                city = city_link.xpath('.//text()').get()
                city_url = city_link.xpath('.//@href').get()

                urls = self._listing_urls(city, city_url)
                if urls is None:
                    self.logger.warning(
                        'Skipping city %r: cannot derive listing URLs from %r',
                        city, city_url)
                    continue
                newhouse_url, esf_url = urls

                yield scrapy.Request(url=newhouse_url,
                                     callback=self.parse_newhouse,
                                     meta={'info': (province, city)})
                yield scrapy.Request(url=esf_url,
                                     callback=self.parse_esf,
                                     meta={'info': (province, city)})

    def parse_newhouse(self, response):
        """Extract new-house items from a listing page and follow pagination.

        Reads ``(province, city)`` from ``response.meta['info']``, yields one
        ``NewHouseItem`` per listing block, then a request for the next page
        when a "next" link is present.
        """
        province, city = response.meta.get('info')
        details = response.xpath(
            '//div[contains(@class, "nl_con")]/ul//div[@class="nlc_details"]')
        for div in details:
            # A block without a name link yields '' instead of crashing.
            name = self._strip_ws(
                div.xpath('.//div[@class="nlcd_name"]/a/text()').get())

            price = ' '.join(
                div.xpath('.//div[@class="nhouse_price"]//text()').getall())
            # Drop whitespace and the advertisement marker.
            price = re.sub(r'\s|广告', '', price)

            room_links = div.xpath(
                './/div[contains(@class, "house_type")]//a/text()').getall()
            # Keep only the link texts that contain '居'.
            rooms = [text for text in room_links if '居' in text]

            area = ''.join(div.xpath(
                './/div[contains(@class, "house_type")]/text()').getall())
            area = re.sub(r'\s|－|/', '', area)

            address = div.xpath('.//div[@class="address"]/a/@title').get()

            # The district is the text inside square brackets of the
            # address link; None when no bracketed part is present.
            address_text = ''.join(
                div.xpath('.//div[@class="address"]/a//text()').getall())
            district_match = re.search(r'.*\[(.+)\].*', address_text)
            district = district_match.group(1) if district_match else None

            sale = div.xpath(
                './/div[contains(@class, "fangyuan")]/span/text()').get()

            # urljoin resolves relative and protocol-relative hrefs against
            # the page URL; a missing href gives None rather than TypeError.
            href = div.xpath('.//div[@class="nlcd_name"]/a/@href').get()
            origin_url = response.urljoin(href) if href else None

            yield NewHouseItem(
                province=province, city=city, name=name, price=price,
                rooms=rooms, area=area, address=address, district=district,
                sale=sale, origin_url=origin_url)

        next_page = response.xpath('.//a[@class="next"][last()]/@href').get()
        if next_page:
            yield scrapy.Request(response.urljoin(next_page),
                                 callback=self.parse_newhouse,
                                 meta={'info': (province, city)})

    def parse_esf(self, response):
        """Extract second-hand items from a listing page and follow pagination.

        Reads ``(province, city)`` from ``response.meta['info']``, yields one
        ``ESFItem`` per listing entry, then a request for the next page when
        the pagination link is present.
        """
        province, city = response.meta.get('info')
        dls = response.xpath(
            '//div[contains(@class, "shop_list")]//dl[@dataflag="bg"]')
        for dl in dls:
            item = ESFItem(province=province, city=city)

            # The summary line is a '|'-separated list of attributes; each
            # segment is classified by the marker character it contains.
            summary = ''.join(
                dl.xpath('.//dd/p[@class="tel_shop"]//text()').getall())
            for info in self._strip_ws(summary).split('|'):
                if '层' in info:
                    item['floor'] = info
                elif '厅' in info or '独栋' in info:
                    item['rooms'] = info
                elif '㎡' in info:
                    item['area'] = info
                elif '向' in info:
                    item['toward'] = info
                elif '年' in info:
                    item['year'] = info.replace('年建', '')

            # Without this guard urljoin(None) would return the listing page
            # URL itself, which is not the entry's own URL.
            href = dl.xpath('.//h4/a/@href').get()

            item['name'] = dl.xpath(
                './/dd/p[@class="add_shop"]/a/@title').get()
            item['address'] = dl.xpath(
                './/dd/p[@class="add_shop"]/span/text()').get()
            item['price'] = ' '.join(dl.xpath(
                './/dd[@class="price_right"]/span[@class="red"]//text()'
            ).getall())
            item['unit'] = dl.xpath(
                './/dd[@class="price_right"]/span[last()]/text()').get()
            item['origin_url'] = response.urljoin(href) if href else None
            yield item

        # NOTE(review): the next-page link is located by position
        # (third-from-last <p>) — confirm against the live page markup.
        next_page = response.xpath(
            './/div[@class="page_al"]/p[last()-2]/a/@href').get()
        if next_page:
            yield scrapy.Request(url=response.urljoin(next_page),
                                 callback=self.parse_esf,
                                 meta={'info': (province, city)})
