Skip to content

Commit 8c6ec57

Browse files
author
lenarsaitov
committed
add house_number parsing
1 parent 1f34993 commit 8c6ec57

File tree

4 files changed

+68
-48
lines changed

4 files changed

+68
-48
lines changed

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ Collecting information from pages with list of announcements
4747
'district': 'Замоскворечье',
4848
'underground': 'Новокузнецкая',
4949
'street': 'Космодамианская набережная',
50+
'house_number': '25',
5051
'floor': 5,
5152
'floors_count': 12,
5253
'total_meters': 85.0,
@@ -90,6 +91,7 @@ Total number of parced announcements: 56. Average price per month: 236 426 rub
9091
* __district__ - район
9192
* __underground__ - метро
9293
* __street__ - улица
94+
* __house_number__ - номер дома
9395
* __floor__ - этаж
9496
* __floors_count__ - общее количество этажей
9597
* __total_meters__ - общая площадь

cianparser/parser.py

Lines changed: 64 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,6 @@ def build_url(self):
9696
if self.is_by_homeowner:
9797
url += IS_ONLY_HOMEOWNER
9898

99-
10099
return url
101100

102101
def load_page(self, number_page=1):
@@ -146,10 +145,11 @@ def parse_page(self, html: str, number_page: int, count_of_pages: int, attempt_n
146145
if not self.is_express_mode:
147146
time.sleep(4)
148147

149-
total_planed_announcements = len(offers)*count_of_pages
148+
total_planed_announcements = len(offers) * count_of_pages
150149

151150
print(f"\r {number_page - self.start_page + 1} | {number_page} page with list: [" + "=>" * (
152-
ind + 1) + " " * (len(offers) - ind - 1) + "]" + f" {math.ceil((ind + 1) * 100 / len(offers))}" + "%" +
151+
ind + 1) + " " * (
152+
len(offers) - ind - 1) + "]" + f" {math.ceil((ind + 1) * 100 / len(offers))}" + "%" +
153153
f" | Count of all parsed: {self.parsed_announcements_count}."
154154
f" Progress ratio: {math.ceil(self.parsed_announcements_count * 100 / total_planed_announcements)} %."
155155
f" Average price: {'{:,}'.format(int(self.average_price)).replace(',', ' ')} rub", end="\r",
@@ -204,8 +204,8 @@ def parse_page_offer(self, html_offer):
204204
try:
205205
contact_data = soup_offer_page.select("div[data-name='OfferContactsAside']")[0].text
206206
if "+7" in contact_data:
207-
page_data["phone"] = (contact_data[contact_data.find("+7"):contact_data.find("+7") + 16]).\
208-
replace(" ", "").\
207+
page_data["phone"] = (contact_data[contact_data.find("+7"):contact_data.find("+7") + 16]). \
208+
replace(" ", ""). \
209209
replace("-", "")
210210
except:
211211
pass
@@ -328,8 +328,8 @@ def parse_page_offer_json(self, html_offer):
328328
page_data["floors_count"] = int(ints[1])
329329

330330
if "+7" in html_offer:
331-
page_data["phone"] = html_offer[html_offer.find("+7"): html_offer.find("+7") + 16].split('"')[0].\
332-
replace(" ", "").\
331+
page_data["phone"] = html_offer[html_offer.find("+7"): html_offer.find("+7") + 16].split('"')[0]. \
332+
replace(" ", ""). \
333333
replace("-", "")
334334

335335
return page_data
@@ -393,6 +393,7 @@ def define_location_data(self, block):
393393
location_data = dict()
394394
location_data["district"] = ""
395395
location_data["street"] = ""
396+
location_data["house_number"] = ""
396397
location_data["underground"] = ""
397398

398399
if self.is_sale():
@@ -415,6 +416,11 @@ def define_location_data(self, block):
415416
if "," in location_data["underground"]:
416417
location_data["underground"] = location_data["underground"].split(",")[0]
417418

419+
if (any(chr.isdigit() for chr in address_elements[-1]) and "жк" not in address_elements[-1].lower() and
420+
not any(street_type in address_elements[-1].lower() for street_type in STREET_TYPES)) and len(
421+
address_elements[-1]) < 10:
422+
location_data["house_number"] = address_elements[-1].strip()
423+
418424
for ind, elem in enumerate(address_elements):
419425
if "р-н" in elem:
420426
district = elem.replace("р-н", "").strip()
@@ -438,9 +444,10 @@ def define_location_data(self, block):
438444
location_data["street"] = address_elements[-2].strip()
439445
if street_type == "улица":
440446
location_data["street"] = location_data["street"].replace("улица", "")
447+
441448
return location_data
442449

443-
for after_district_address_element in address_elements[ind + 1:]:
450+
for k, after_district_address_element in enumerate(address_elements[ind + 1:]):
444451
if len(list(set(after_district_address_element.split(" ")).intersection(
445452
NOT_STREET_ADDRESS_ELEMENTS))) != 0:
446453
continue
@@ -449,6 +456,7 @@ def define_location_data(self, block):
449456
continue
450457

451458
location_data["street"] = after_district_address_element.strip()
459+
452460
return location_data
453461

454462
return location_data
@@ -471,6 +479,13 @@ def define_location_data(self, block):
471479
if "ЖК" in address_elements[-2]:
472480
location_data["residential_complex"] = address_elements[-2].strip()
473481

482+
if (any(chr.isdigit() for chr in address_elements[-1]) and "жк" not in address_elements[
483+
-1].lower() and
484+
not any(
485+
street_type in address_elements[-1].lower() for street_type in STREET_TYPES)) and len(
486+
address_elements[-1]) < 10:
487+
location_data["house_number"] = address_elements[-1].strip()
488+
474489
for street_type in STREET_TYPES:
475490
if street_type in address_elements[-1]:
476491
location_data["street"] = address_elements[-1].strip()
@@ -491,6 +506,13 @@ def define_location_data(self, block):
491506
if len(address_elements) < 3:
492507
continue
493508

509+
if (any(chr.isdigit() for chr in address_elements[-1]) and "жк" not in address_elements[
510+
-1].lower() and
511+
not any(
512+
street_type in address_elements[-1].lower() for street_type in STREET_TYPES)) and len(
513+
address_elements[-1]) < 10:
514+
location_data["house_number"] = address_elements[-1].strip()
515+
494516
if street_type in address_elements[-1]:
495517
location_data["street"] = address_elements[-1].strip()
496518
if street_type == "улица":
@@ -523,10 +545,12 @@ def define_price_data(self, block):
523545
for element in elements:
524546
if "₽/мес" in element.text:
525547
price_description = element.text
526-
price_data["price_per_month"] = int("".join(price_description[:price_description.find("₽/мес") - 1].split()))
548+
price_data["price_per_month"] = int(
549+
"".join(price_description[:price_description.find("₽/мес") - 1].split()))
527550

528551
if "%" in price_description:
529-
price_data["commissions"] = int(price_description[price_description.find("%") - 2:price_description.find("%")].replace(" ", ""))
552+
price_data["commissions"] = int(
553+
price_description[price_description.find("%") - 2:price_description.find("%")].replace(" ", ""))
530554

531555
return price_data
532556

@@ -539,51 +563,40 @@ def define_price_data(self, block):
539563
return price_data
540564

541565
def define_specification_data(self, block):
566+
specification_data = dict()
567+
specification_data["floor"] = -1
568+
specification_data["floors_count"] = -1
569+
specification_data["rooms_count"] = -1
570+
specification_data["total_meters"] = -1
571+
542572
title = block.select("div[data-name='LinkArea']")[0].select("div[data-name='GeneralInfoSectionRowComponent']")[
543573
0].text
544574

545575
common_properties = block.select("div[data-name='LinkArea']")[0]. \
546576
select("div[data-name='GeneralInfoSectionRowComponent']")[0].text
547577

548-
total_meters = None
549578
if common_properties.find("м²") is not None:
550579
total_meters = title[: common_properties.find("м²")].replace(",", ".")
551580
if len(re.findall(FLOATS_NUMBERS_REG_EXPRESSION, total_meters)) != 0:
552-
total_meters = float(re.findall(FLOATS_NUMBERS_REG_EXPRESSION, total_meters)[-1].replace(" ", "").replace("-", ""))
553-
else:
554-
total_meters = -1
581+
specification_data["total_meters"] = float(
582+
re.findall(FLOATS_NUMBERS_REG_EXPRESSION, total_meters)[-1].replace(" ", "").replace("-", ""))
555583

556584
if "этаж" in common_properties:
557585
floor_per = common_properties[common_properties.rfind("этаж") - 7: common_properties.rfind("этаж")]
586+
floor_properties = floor_per.split("/")
558587

559-
floor_per = floor_per.split("/")
588+
if len(floor_properties) == 2:
589+
ints = re.findall(r'\d+', floor_properties[0])
590+
if len(ints) != 0:
591+
specification_data["floor"] = int(ints[-1])
560592

561-
if len(floor_per) == 0:
562-
floor, floors_count = -1, -1
563-
else:
564-
floor, floors_count = floor_per[0], floor_per[1]
593+
ints = re.findall(r'\d+', floor_properties[1])
594+
if len(ints) != 0:
595+
specification_data["floors_count"] = int(ints[-1])
565596

566-
ints = re.findall(r'\d+', floor)
567-
if len(ints) == 0:
568-
floor = -1
569-
else:
570-
floor = int(ints[-1])
597+
specification_data["rooms_count"] = define_rooms_count(common_properties)
571598

572-
ints = re.findall(r'\d+', floors_count)
573-
if len(ints) == 0:
574-
floors_count = -1
575-
else:
576-
floors_count = int(ints[-1])
577-
else:
578-
floors_count = -1
579-
floor = -1
580-
581-
return {
582-
"floor": floor,
583-
"floors_count": floors_count,
584-
"rooms_count": define_rooms_count(common_properties),
585-
"total_meters": total_meters,
586-
}
599+
return specification_data
587600

588601
def parse_block(self, block):
589602
common_data = dict()
@@ -597,7 +610,8 @@ def parse_block(self, block):
597610
price_data = self.define_price_data(block=block)
598611
specification_data = self.define_specification_data(block=block)
599612

600-
if self.is_by_homeowner and (author_data["author_type"] != "unknown" and author_data["author_type"] != "homeowner"):
613+
if self.is_by_homeowner and (
614+
author_data["author_type"] != "unknown" and author_data["author_type"] != "homeowner"):
601615
return
602616

603617
if self.is_latin:
@@ -641,19 +655,22 @@ def parse_block(self, block):
641655

642656
specification_data["price_per_m2"] = float(0)
643657
if "price" in price_data:
644-
self.average_price = (self.average_price*self.parsed_announcements_count + price_data["price"])/(self.parsed_announcements_count+1)
645-
price_data["price_per_m2"] = int(float(price_data["price"])/specification_data["total_meters"])
658+
self.average_price = (self.average_price * self.parsed_announcements_count + price_data["price"]) / (
659+
self.parsed_announcements_count + 1)
660+
price_data["price_per_m2"] = int(float(price_data["price"]) / specification_data["total_meters"])
646661
elif "price_per_month" in price_data:
647-
self.average_price = (self.average_price*self.parsed_announcements_count + price_data["price_per_month"])/(self.parsed_announcements_count+1)
648-
price_data["price_per_m2"] = int(float(price_data["price_per_month"])/specification_data["total_meters"])
662+
self.average_price = (self.average_price * self.parsed_announcements_count + price_data[
663+
"price_per_month"]) / (self.parsed_announcements_count + 1)
664+
price_data["price_per_m2"] = int(float(price_data["price_per_month"]) / specification_data["total_meters"])
649665

650666
self.parsed_announcements_count += 1
651667

652668
if define_id_url(common_data["link"]) in self.result_parsed:
653669
return
654670

655671
self.result_parsed.add(define_id_url(common_data["link"]))
656-
self.result.append(self.union(author_data, common_data, specification_data, price_data, page_data, location_data))
672+
self.result.append(
673+
self.union(author_data, common_data, specification_data, price_data, page_data, location_data))
657674

658675
if self.is_saving_csv:
659676
self.save_results()
@@ -705,7 +722,8 @@ def save_results(self):
705722

706723
def load_and_parse_page(self, number_page, count_of_pages, attempt_number):
707724
html = self.load_page(number_page=number_page)
708-
return self.parse_page(html=html, number_page=number_page, count_of_pages=count_of_pages, attempt_number=attempt_number)
725+
return self.parse_page(html=html, number_page=number_page, count_of_pages=count_of_pages,
726+
attempt_number=attempt_number)
709727

710728
def run(self):
711729
print(f"\n{' ' * 30}Preparing to collect information from pages..")

setup.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[metadata]
22
name = cianparser
3-
version = 0.4.22
3+
version = 0.4.23
44
description = Parser information from Cian website
55
url = https://github.com/lenarsaitov/cianparser
66
author = Lenar Saitov

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
setup(
88
name='cianparser',
9-
version='0.4.22',
9+
version='0.4.23',
1010
description='Parser information from Cian website',
1111
url='https://github.com/lenarsaitov/cianparser',
1212
author='Lenar Saitov',

0 commit comments

Comments
 (0)