#!/usr/bin/python
# -*- coding: utf-8 -*-
import json
import logging
import re
from bs4 import BeautifulSoup
from scrapper_helpers.utils import flatten
from olx.utils import city_name, get_content_for_url, get_url
# Module-level logger. Use __name__ (not __file__) so records carry the dotted
# module path and participate in the standard logging hierarchy.
log = logging.getLogger(__name__)
# NOTE(review): calling basicConfig at import time is a side effect on any
# program importing this module; kept for backward compatibility.
logging.basicConfig(level=logging.DEBUG)
def get_page_count(markup):
    """ Reads total page number from OLX search page

    :param markup: OLX search page markup
    :type markup: str
    :return: Total page number extracted from js script
    :rtype: int
    """
    parser = BeautifulSoup(markup, "html.parser")
    no_pages_msg = "Error no page number found. Please check if it's valid olx page."
    # Grab the first head <script> mentioning "page_count", pre-split on commas.
    metadata = next(
        (tag.text.split(",") for tag in parser.head.find_all("script") if "page_count" in tag.text),
        None,
    )
    if metadata is None:
        log.warning(no_pages_msg)
        return 1
    for chunk in metadata:
        if "page_count" not in chunk:
            continue
        # Keep only the digits after the last colon, e.g. '"page_count":"12"' -> 12.
        tail = chunk.split(":")[-1]
        return int("".join(ch for ch in tail if ch.isdigit()))
    log.warning(no_pages_msg)
    return 1
def get_page_count_for_filters(main_category=None, sub_category=None, detail_category=None, region=None,
                               search_query=None, url=None, **filters):
    """ Reads total page number for given search filters

    :param url: User defined url for OLX page with offers. It overrides category parameters and applies search filters.
    :param main_category: Main category
    :param sub_category: Sub category
    :param detail_category: Detail category
    :param region: Region of search
    :param search_query: Additional search query
    :param filters: See :meth category.get_category for reference
    :type url: str, None
    :type main_category: str, None
    :type sub_category: str, None
    :type detail_category: str, None
    :type region: str, None
    :type search_query: str, None
    :return: Total page number
    :rtype: int
    """
    city = city_name(region) if region else None
    if url is None:
        url = get_url(main_category, sub_category, detail_category, city, search_query, **filters)
    response = get_content_for_url(url)
    # The original body duplicated get_page_count's script-parsing logic
    # verbatim; delegate instead so the parsing lives in one place.
    return get_page_count(response.content)
def parse_ads_count(markup):
    """ Reads total number of adds

    :param markup: OLX search page markup
    :type markup: str
    :return: Total ads count from script, or 0 when it cannot be parsed
    :rtype: int
    """
    html_parser = BeautifulSoup(markup, "html.parser")
    scripts = html_parser.find_all('script')
    data = ''
    for script in scripts:
        # script.string is None for tags with nested markup; "in None" raises
        # TypeError, which we treat as "not the script we want".
        try:
            if "GPT.targeting" in script.string:
                data = script.string
                break
        except TypeError:
            continue
    try:
        # Bug fix: when no "GPT.targeting" script exists, the split yields a
        # single element and [2] raised an uncaught IndexError; catch it too.
        data_dict = json.loads((re.split('GPT.targeting = |;', data))[2].replace(";", ""))
    except (json.JSONDecodeError, IndexError) as e:
        # Consistency: use the module logger like the rest of this file.
        log.info("JSON failed to parse GPT offer attributes. Error: {0}".format(e))
        return 0
    # Bug fix: a missing "ads_count" key made int(None) raise TypeError;
    # default to 0 instead.
    return int(data_dict.get("ads_count", 0))
def parse_offer_url(markup, today=None, yesterday=None):
    """ Searches for offer links in markup

    Offer links on OLX are in class "linkWithHash".
    Only www.olx.pl domain is whitelisted.

    :param markup: Search page markup
    :param today: Should search for offers posted today?
    :param yesterday: Should search for offers posted yesterday?
    :type today: object
    :type yesterday: object
    :type markup: str
    :return: Url with offer, or None when no link exists or the date filter rejects it
    :rtype: str, None
    """
    today_str = "dzisiaj"
    yesterday_str = "wczoraj"
    html_parser = BeautifulSoup(markup, "html.parser")
    date_added = html_parser.find(class_='color-9 lheight16 marginbott5 x-normal')
    date_added = date_added.text.strip() if date_added else None
    offer_link = html_parser.find('a')
    if not offer_link:
        return
    url = offer_link.attrs['href']
    if not today and not yesterday:
        return url
    # Bug fix: with a date filter active but no date element in the markup the
    # original raised TypeError ("in None"); treat a missing date as no match.
    if date_added is None:
        return
    # At this point at least one of today/yesterday is truthy, so the original
    # "(today or yesterday) and ..." guard reduces to the date check alone.
    if yesterday_str not in date_added and today_str not in date_added:
        return
    if today and not yesterday and yesterday_str in date_added:
        return
    if yesterday and not today and today_str in date_added:
        return
    return url
def parse_available_offers(markup, today=None, yesterday=None):
    """ Collects all offer links on search page markup

    :param markup: Search page markup
    :param today: Should search for offers posted today?
    :param yesterday: Should search for offers posted yesterday?
    :type today: object
    :type yesterday: object
    :type markup: str
    :return: Links to offer on given search page, or None when the page reports no offers
    :rtype: list, None
    """
    soup = BeautifulSoup(markup, "html.parser")
    # OLX renders an "emptynew" element when the search produced nothing.
    if soup.find(class_="emptynew") is not None:
        log.warning("No offers found")
        return
    ads_count = parse_ads_count(markup)
    candidates = soup.find_all(class_='offer')
    if not candidates:
        # Fallback selector for the alternate listing layout.
        candidates = soup.select("li.wrap.tleft")
    links = []
    for candidate in candidates:
        if candidate:
            links.append(parse_offer_url(str(candidate), today, yesterday))
    # Cap the result at the advertised ads count from the page metadata.
    return links[:ads_count]
def get_category(main_category=None, sub_category=None, detail_category=None, region=None, search_query=None, url=None,
                 **filters):
    """ Parses available offer urls from given category from every page

    :param url: User defined url for OLX page with offers. It overrides category parameters and applies search filters.
    :param main_category: Main category
    :param sub_category: Sub category
    :param detail_category: Detail category
    :param region: Region of search
    :param search_query: Additional search query
    :param filters: Dictionary with additional filters. Following example dictionary contains every possible filter
        with examples of it's values.

    :Example:

    input_dict = {
        "[filter_float_price:from]": 2000,  # minimal price
        "[filter_float_price:to]": 3000,  # maximal price
        "[filter_enum_floor_select][0]": 3,  # desired floor, enum: from -1 to 11 (10 and more) and 17 (attic)
        "[filter_enum_furniture][0]": True,  # furnished or unfurnished offer
        "[filter_enum_builttype][0]": "blok",  # valid build types:
        #   blok, kamienica, szeregowiec, apartamentowiec, wolnostojacy, loft
        "[filter_float_m:from]": 25,  # minimal surface
        "[filter_float_m:to]": 50,  # maximal surface
        "[filter_enum_rooms][0]": 2,  # desired number of rooms, enum: from 1 to 4 (4 and more)
        "today": True,  # Should search for offer posted today?
        "yesterday": True,  # Should search for offers posted yesterday?
    }

    :type url: str, None
    :type main_category: str, None
    :type sub_category: str, None
    :type detail_category: str, None
    :type region: str, None
    :type search_query: str, None
    :type filters: dict
    :return: List of all offers for given parameters
    :rtype: list
    """
    city = city_name(region) if region else None
    start_url = None
    if url is None:
        url = get_url(main_category, sub_category, detail_category, city, search_query, **filters)
    else:
        # A user-supplied url is remembered so per-page urls derive from it.
        start_url = url
    first_response = get_content_for_url(url)
    page_max = get_page_count(first_response.content)
    collected = []
    for page in range(page_max):
        if start_url is None:
            url = get_url(main_category, sub_category, detail_category, city, search_query, page, **filters)
        else:
            url = get_url(page=page, user_url=start_url, **filters)
        log.debug(url)
        response = get_content_for_url(url)
        log.info("Loaded page {0} of offers".format(page))
        offers = parse_available_offers(response.content, filters.get('today'), filters.get('yesterday'))
        if offers is None:
            # None means the page reported no offers; stop paging.
            break
        collected.append(offers)
    result = [offer for offer in list(flatten(collected)) if offer]
    log.info("Loaded {0} offers".format(str(len(result))))
    return result
def get_offers_for_page(page, main_category=None, sub_category=None, detail_category=None, region=None,
                        search_query=None, url=None, **filters):
    """ Parses offers for one specific page of given category with filters.

    :param page: Page number
    :param url: User defined url for OLX page with offers. It overrides category parameters and applies search filters.
    :param main_category: Main category
    :param sub_category: Sub category
    :param detail_category: Detail category
    :param region: Region of search
    :param search_query: Additional search query
    :param filters: See :meth category.get_category for reference
    :type page: int
    :type url: str, None
    :type main_category: str, None
    :type sub_category: str, None
    :type detail_category: str, None
    :type region: str, None
    :type search_query: str, None
    :type filters: dict
    :return: List of all offers for given page and parameters
    :rtype: list
    """
    city = city_name(region) if region else None
    if url is None:
        url = get_url(main_category, sub_category, detail_category, city, search_query, page=page, **filters)
    else:
        url = get_url(page=page, user_url=url, **filters)
    response = get_content_for_url(url)
    log.info("Loaded page {0} of offers".format(page))
    # Consistency with get_category: forward the today/yesterday date filters.
    available = parse_available_offers(response.content, filters.get('today'), filters.get('yesterday'))
    # Bug fix: parse_available_offers returns None when the page has no offers,
    # which made the original comprehension raise TypeError; yield [] instead.
    offers = [offer for offer in available if offer] if available else []
    log.info("Loaded {0} offers".format(str(len(offers))))
    return offers