#!/usr/bin/python
# -*- coding: utf-8 -*-
import json
import logging
import re
from bs4 import BeautifulSoup
from olx.utils import get_content_for_url
try:
    # Python 2 keeps ``unicode`` in the ``__builtin__`` module.
    from __builtin__ import unicode
except ImportError:
    # ``__builtin__`` is not importable here, so provide a pass-through
    # stand-in that keeps call sites such as ``unicode(value, "utf-8")`` valid.
    def unicode(x, *args):
        """Return ``x`` unchanged, ignoring any extra positional arguments."""
        return x

# Module-level logger; it is named after this file's path.
log = logging.getLogger(__file__)
def get_title(offer_markup):
    """ Searches for offer title on offer page

    The text of the first ``<h1>`` element is returned with every run of
    whitespace (spaces, tabs, line breaks) collapsed to a single space and
    leading/trailing whitespace removed.

    :param offer_markup: Class "offerbody" from offer page markup
    :type offer_markup: str
    :return: Title of offer
    :rtype: str
    :raises AttributeError: When the markup contains no ``<h1>`` element
    """
    html_parser = BeautifulSoup(offer_markup, "html.parser")
    # Collapse whitespace rather than deleting spaces, so multi-word titles
    # keep their word boundaries.
    return " ".join(html_parser.h1.text.split())
def parse_price(offer_markup):
    """ Extracts the offer price and its currency from the page head

    The value is read from the text of the third sibling node that follows
    the first ``<script>`` element. That text is split on ``pageView`` and
    ``;``, the fourth fragment is patched into valid JSON and decoded, and
    the ``ad_price`` / ``price_currency`` entries are returned.

    :param offer_markup: Head from offer page
    :type offer_markup: str
    :return: Tuple of int price and its currency
    :rtype: tuple
    """
    soup = BeautifulSoup(offer_markup, "html.parser")
    first_script = soup.find('script')
    # Walk three siblings forward from the first script to reach the node
    # whose text carries the price data.
    target_node = first_script.next_sibling.next_sibling.next_sibling
    fragments = re.split("pageView|;", target_node.text)
    # Trim the surrounding wrapper so the fragment parses as a JSON object.
    raw_json = fragments[3].replace('":{', "{").replace("}}'", "}")
    payload = json.loads(raw_json)
    price = int(payload["ad_price"])
    currency = payload["price_currency"]
    return price, currency
def get_additional_rent(offer_markup):
    """ Searches for additional rental costs

    Scans every element with class ``item`` and, for the first one whose text
    contains "Czynsz" and at least one digit, returns all of its digits joined
    into a single integer.

    :param offer_markup: Offer page markup containing the ``item`` elements
    :type offer_markup: str
    :return: Additional rent, or None when no matching item with digits exists
    :rtype: int or None
    """
    html_parser = BeautifulSoup(offer_markup, "html.parser")
    for element in html_parser.find_all(class_="item"):
        text = element.text
        if "Czynsz" not in text:
            continue
        digits = "".join(re.findall(r'\d+', text))
        # A matching item without any digits would make int("") raise
        # ValueError, so skip it and keep looking.
        if digits:
            return int(digits)
    return None
def get_gps(offer_markup):
    """ Reads gps coordinates (latitude and longitude) of the offer

    Both values come from the ``data-lat`` and ``data-lon`` attributes of the
    first element with class ``mapcontainer`` and are returned as found in
    the markup (no numeric conversion is applied).

    :param offer_markup: Class "offerbody" from offer page markup
    :type offer_markup: str
    :return: Tuple of gps coordinates (latitude, longitude)
    :rtype: tuple
    """
    soup = BeautifulSoup(offer_markup, "html.parser")
    map_attrs = soup.find(class_="mapcontainer").attrs
    latitude = map_attrs['data-lat']
    longitude = map_attrs['data-lon']
    return latitude, longitude
def get_poster_name(offer_markup):
    """ Searches for poster name

    The text of the first ``<h4>`` element is returned with every run of
    whitespace collapsed to a single space and leading/trailing whitespace
    removed.

    :param offer_markup: Class "offerbody" from offer page markup
    :type offer_markup: str
    :return: Poster name
    :rtype: str
    :raises AttributeError: When the markup contains no ``<h4>`` element
    """
    html_parser = BeautifulSoup(offer_markup, "html.parser")
    # Collapse whitespace rather than deleting spaces, so names made of
    # several words are not glued together.
    return " ".join(html_parser.h4.text.split())
def get_surface(offer_markup):
    """ Reads the flat surface from offer markup

    The surface is taken from the text of the parent of the first ``<sup>``
    element; the " m2" suffix, tabs and newlines are dropped and a decimal
    comma is turned into a dot before conversion.

    :param offer_markup: Class "offerbody" from offer page markup
    :type offer_markup: str
    :return: Surface
    :rtype: float
    :except: When there is no offer surface it will return None
    """
    soup = BeautifulSoup(offer_markup, "html.parser")
    try:
        raw_surface = soup.sup.parent.text
    except AttributeError as error:
        # No <sup> element in the markup: the offer has no surface entry.
        log.debug(error)
        return None
    cleaned = raw_surface
    for old, new in ((" m2", ""), ("\t", ""), ("\n", ""), (",", ".")):
        cleaned = cleaned.replace(old, new)
    return float(cleaned)
def parse_description(offer_markup):
    """ Searches for description in offer markup

    The text of the element with id ``textContent`` is returned as a single
    line: every run of whitespace (spaces, tabs, ``\\n``, ``\\r``) becomes one
    space and leading/trailing whitespace is removed.

    :param offer_markup: Body from offer page markup
    :type offer_markup: str
    :return: Description of offer
    :rtype: str
    :raises AttributeError: When no element with id ``textContent`` exists
    """
    html_parser = BeautifulSoup(offer_markup, "html.parser")
    text = html_parser.find(id="textContent").text
    # Collapse whitespace rather than deleting spaces, so the words of the
    # description stay separated.
    return " ".join(text.split())
def get_img_url(offer_markup):
    """ Collects image urls from offer markup

    Returns the ``src`` attribute of every element with class ``bigImage``,
    in document order.

    :param offer_markup: Class "offerbody" from offer page markup
    :type offer_markup: str
    :return: Images of offer in list
    :rtype: list
    """
    soup = BeautifulSoup(offer_markup, "html.parser")
    return [image.attrs["src"] for image in soup.find_all(class_="bigImage")]
def get_date_added(offer_markup):
    """ Searches for date of adding offer

    Reads the children of the ``<em>`` element inside the element with class
    ``offer-titlebox__details``. When there are more than four children the
    fifth one is used, otherwise the first one.
    NOTE(review): presumably the longer form appears when extra markup
    precedes the date text - confirm against live pages.

    The chosen text has the word "Dodane" removed, whitespace collapsed, the
    standalone word "o" dropped and ", " replaced by a single space.

    :param offer_markup: Class "offerbody" from offer page markup
    :type offer_markup: str
    :return: Date of adding offer
    :rtype: str
    """
    html_parser = BeautifulSoup(offer_markup, "html.parser")
    parts = html_parser.find(class_="offer-titlebox__details").em.contents
    raw_date = parts[4] if len(parts) > 4 else parts[0]
    # Normalise whitespace first; deleting every space would leave nothing
    # for the "o " and ", " clean-ups below to match.
    text = " ".join(raw_date.replace("Dodane", "").split())
    # Drop only the standalone word "o"; a plain replace of "o " would also
    # eat the ending of words such as "lutego".
    text = re.sub(r"\bo ", "", text)
    return text.replace(", ", " ")
def parse_region(offer_markup):
    """ Parses region information

    Takes the text of the first element with class ``show-map-link`` and
    splits it on commas; a single space directly after a comma is dropped.

    :param offer_markup: Offer markup containing the ``show-map-link`` element
    :type offer_markup: str
    :return: Comma-separated parts of the region text
    :rtype: list
    """
    soup = BeautifulSoup(offer_markup, "html.parser")
    location_text = soup.find(class_="show-map-link").text
    compact = location_text.replace(", ", ",")
    return compact.split(",")
def parse_flat_data(offer_markup):
    """ Parses flat data from the script that sets ``GPT.targeting``

    Data includes if offer private or business, number of floor, number of
    rooms, built type and furniture.

    :param offer_markup: Body from offer page markup
    :type offer_markup: str
    :return: Dictionary of flat data with the keys ``private_business``,
        ``floor``, ``rooms``, ``builttype`` and ``furniture``
    :rtype: dict
    :raises AttributeError: When no script containing ``GPT.targeting`` exists
    """
    html_parser = BeautifulSoup(offer_markup, "html.parser")
    data = None
    for script in html_parser.find_all('script'):
        content = script.string
        # ``.string`` is None for a tag without a single text child (for
        # example an empty external script), and ``in None`` raises TypeError.
        if content is not None and "GPT.targeting" in content:
            data = content
            break
    if data is None:
        # AttributeError matches how missing markup surfaces elsewhere in
        # this module; get_descriptions treats it as an unavailable offer.
        raise AttributeError("No script containing GPT.targeting found in offer markup")
    # Splitting on ";" already strips every semicolon from the fragments.
    data_dict = json.loads(re.split('GPT.targeting = |;', data)[3])
    translate = {"one": 1, "two": 2, "three": 3, "four": 4}
    rooms = data_dict.get("rooms", None)
    if rooms is not None:
        # Values are lists; the first entry is a number word such as "two".
        rooms = translate[rooms[0]]
    floor = data_dict.get("floor_select", [None])[0]
    if floor is not None:
        # Entries look like "floor_3"; keep only the numeric part.
        floor = int(floor.replace("floor_", ""))
    return {
        "private_business": data_dict.get("private_business", None),
        "floor": floor,
        "rooms": rooms,
        "builttype": data_dict.get("builttype", [None])[0],
        "furniture": data_dict.get("furniture", [None])[0] == 'yes'
    }
def parse_offer(markup, url):
    """ Parses data from offer page markup

    Price is read from the page ``<head>``, flat data and gps coordinates
    from the ``<body>``, and everything else from the element with class
    ``offerbody``. The flat-data dictionary is merged into the result
    between the ``surface`` and ``description`` entries.

    :param markup: Offer page markup
    :param url: Url of current offer page
    :type markup: str
    :type url: str
    :return: Dictionary with all offer details
    :rtype: dict
    """
    html_parser = BeautifulSoup(markup, "html.parser")
    page_body = str(html_parser.body)
    offer_price = parse_price(str(html_parser.head))
    offer_data = parse_flat_data(page_body)
    gps_coordinates = get_gps(page_body)
    offer_content = str(html_parser.find(class_='offerbody'))
    region = parse_region(offer_content)
    # Region parts are ordered city, voivodeship, district. Pad with None so
    # a location listed with fewer parts does not raise IndexError.
    city, voivodeship, district = (region + [None] * 3)[:3]
    offer = {
        "title": get_title(offer_content),
        "price": offer_price[0],
        "additional_rent": get_additional_rent(offer_content),
        "currency": offer_price[1],
        "city": city,
        "district": district,
        "voivodeship": voivodeship,
        "gps": gps_coordinates,
        "surface": get_surface(offer_content),
    }
    # ``dict.update`` instead of ``**offer_data`` inside the literal keeps
    # the module usable on interpreters without dict unpacking.
    offer.update(offer_data)
    offer.update({
        "description": parse_description(offer_content),
        "poster_name": get_poster_name(offer_content),
        "url": url,
        "date": get_date_added(offer_content),
        "images": get_img_url(offer_content),
    })
    return offer
def get_descriptions(parsed_urls):
    """ Fetches and parses the details of every offer url given

    ``None`` entries are skipped. Each remaining url is downloaded with
    ``get_content_for_url`` and handed to ``parse_offer``; an offer whose
    parsing raises ``AttributeError`` is logged and left out of the result.

    :param parsed_urls: List of offers urls
    :type parsed_urls: list
    :return: List of details of offers
    :rtype: list
    :except: If this offer is not available anymore
    """
    results = []
    for offer_url in parsed_urls:
        if offer_url is not None:
            page = get_content_for_url(offer_url)
            try:
                details = parse_offer(page.content, offer_url)
            except AttributeError as error:
                log.info("This offer is not available anymore.")
                log.debug("Not found: {0} Error: {1}".format(offer_url, error))
            else:
                results.append(details)
    return results