import requests
from urllib import parse
from parsel import Selector  # scrapy.Selector offers the same xpath/extract API

domin = "https://www.9fh.org"  # base domain that relative hrefs are joined against


def get_nodes_list():
    # Fetch the first listing page and collect each actor page's URL.
    pages = requests.get("https://www.9fh.org/special-show-p-1.html").text
    sel = Selector(text=pages)
    p1_girls_url = sel.xpath("//div[@class='row placeholders']/div/h4/a/@href").extract()
    url_list = []
    for te in p1_girls_url:
        url_list.append(parse.urljoin(domin, te))
    return url_list
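

# A pagination variant, a hypothetical helper not called by demo() below:
# it assumes the pager's "next" link is the 7th anchor inside
# //ul[@class='pagination'] (an assumption about this site's markup) and
# de-duplicates actor URLs across pages.
def get_nodes_list_paged(start_url="https://www.9fh.org/special-show-p-1.html"):
    url_list = []
    visited = set()
    page_url = start_url
    while page_url and page_url not in visited:
        visited.add(page_url)
        pages = requests.get(page_url).text
        sel = Selector(text=pages)
        for href in sel.xpath("//div[@class='row placeholders']/div/h4/a/@href").extract():
            full = parse.urljoin(domin, href)
            if full not in url_list:
                url_list.append(full)
        next_page = sel.xpath("//ul[@class='pagination']/a[7]/@href").extract()
        page_url = parse.urljoin(domin, next_page[0]) if next_page else None
    return url_list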
def get_all_url(url):  # collect all work URLs listed on one actor page
    pages = requests.get(url).text
    sel = Selector(text=pages)
    urls = sel.xpath("//table[@class='table table-striped']/tbody/tr/td[2]/a/@href").extract()
    all_url = []
    for tp in urls:
        all_url.append(parse.urljoin(domin, tp))
    return all_url
def demo():
    # Visit every actor page and gather its work URLs; returns a list of lists.
    all_urls = []
    url_list = get_nodes_list()
    for url in url_list:
        all_urls.append(get_all_url(url))
    return all_urls
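
# demo() returns a list of lists (one inner list per actor page), which is
# exactly the shape get_info() iterates below. A hypothetical one-liner in
# case a flat list of work URLs is more convenient elsewhere:
def flatten_urls(nested):
    return [u for sub in nested for u in sub]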
def get_info(last_urls):
    # last_urls is a list of URL lists, one inner list per actor page.
    for single_url in last_urls:
        for i in single_url:
            pages = requests.get(i).text
            sel = Selector(text=pages)
            # requests guesses ISO-8859-1 for these pages although they are
            # UTF-8, so re-encoding as Latin-1 and decoding as UTF-8 undoes
            # the mojibake in the extracted strings.
            name = sel.xpath("//div[@class='row']/div[1]/h2[1]/a/text()").extract()[0].encode('ISO-8859-1').decode('utf8')
            fanhao = sel.xpath("//div[@class='info']/p[1]/span[2]/text()").extract()[0].encode('ISO-8859-1').decode('utf8')
            launch_time = sel.xpath("//div[@class='info']/p[2]/text()").extract()[0]
            varieties = sel.xpath("//div[@class='info']/p[6]/span/a/text()").extract()
            types = ','.join(varieties).encode('ISO-8859-1').decode('utf8')
            work_time = sel.xpath("//div[@class='info']/p[3]/text()").extract()
            wk = ''.join(work_time).encode('ISO-8859-1').decode('utf8')
            act = sel.xpath("//div[@class='row placeholders']/div/h4/a/text()").extract()
            actor = ','.join(act).encode('ISO-8859-1').decode('utf8')
            # Topic is assumed to be an ORM model defined elsewhere in the project.
            topic = Topic()
            topic.main_actor = actor
            topic.fanhao = fanhao
            topic.varieties = types
            topic.launch_time = launch_time
            topic.work_time = wk
            topic.work_name = name
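

# A minimal entry-point sketch chaining the crawl and the parse. Persisting
# each Topic (e.g. a save() call on Django/peewee-style models) depends on
# how Topic is defined, which is not shown here, so it is omitted.
if __name__ == "__main__":
    get_info(demo())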