@@ -96,7 +96,6 @@ def build_url(self):
96
96
if self .is_by_homeowner :
97
97
url += IS_ONLY_HOMEOWNER
98
98
99
-
100
99
return url
101
100
102
101
def load_page (self , number_page = 1 ):
@@ -146,10 +145,11 @@ def parse_page(self, html: str, number_page: int, count_of_pages: int, attempt_n
146
145
if not self .is_express_mode :
147
146
time .sleep (4 )
148
147
149
- total_planed_announcements = len (offers )* count_of_pages
148
+ total_planed_announcements = len (offers ) * count_of_pages
150
149
151
150
print (f"\r { number_page - self .start_page + 1 } | { number_page } page with list: [" + "=>" * (
152
- ind + 1 ) + " " * (len (offers ) - ind - 1 ) + "]" + f" { math .ceil ((ind + 1 ) * 100 / len (offers ))} " + "%" +
151
+ ind + 1 ) + " " * (
152
+ len (offers ) - ind - 1 ) + "]" + f" { math .ceil ((ind + 1 ) * 100 / len (offers ))} " + "%" +
153
153
f" | Count of all parsed: { self .parsed_announcements_count } ."
154
154
f" Progress ratio: { math .ceil (self .parsed_announcements_count * 100 / total_planed_announcements )} %."
155
155
f" Average price: { '{:,}' .format (int (self .average_price )).replace (',' , ' ' )} rub" , end = "\r " ,
@@ -204,8 +204,8 @@ def parse_page_offer(self, html_offer):
204
204
try :
205
205
contact_data = soup_offer_page .select ("div[data-name='OfferContactsAside']" )[0 ].text
206
206
if "+7" in contact_data :
207
- page_data ["phone" ] = (contact_data [contact_data .find ("+7" ):contact_data .find ("+7" ) + 16 ]).\
208
- replace (" " , "" ).\
207
+ page_data ["phone" ] = (contact_data [contact_data .find ("+7" ):contact_data .find ("+7" ) + 16 ]). \
208
+ replace (" " , "" ). \
209
209
replace ("-" , "" )
210
210
except :
211
211
pass
@@ -328,8 +328,8 @@ def parse_page_offer_json(self, html_offer):
328
328
page_data ["floors_count" ] = int (ints [1 ])
329
329
330
330
if "+7" in html_offer :
331
- page_data ["phone" ] = html_offer [html_offer .find ("+7" ): html_offer .find ("+7" ) + 16 ].split ('"' )[0 ].\
332
- replace (" " , "" ).\
331
+ page_data ["phone" ] = html_offer [html_offer .find ("+7" ): html_offer .find ("+7" ) + 16 ].split ('"' )[0 ]. \
332
+ replace (" " , "" ). \
333
333
replace ("-" , "" )
334
334
335
335
return page_data
@@ -393,6 +393,7 @@ def define_location_data(self, block):
393
393
location_data = dict ()
394
394
location_data ["district" ] = ""
395
395
location_data ["street" ] = ""
396
+ location_data ["house_number" ] = ""
396
397
location_data ["underground" ] = ""
397
398
398
399
if self .is_sale ():
@@ -415,6 +416,11 @@ def define_location_data(self, block):
415
416
if "," in location_data ["underground" ]:
416
417
location_data ["underground" ] = location_data ["underground" ].split ("," )[0 ]
417
418
419
+ if (any (chr .isdigit () for chr in address_elements [- 1 ]) and "жк" not in address_elements [- 1 ].lower () and
420
+ not any (street_type in address_elements [- 1 ].lower () for street_type in STREET_TYPES )) and len (
421
+ address_elements [- 1 ]) < 10 :
422
+ location_data ["house_number" ] = address_elements [- 1 ].strip ()
423
+
418
424
for ind , elem in enumerate (address_elements ):
419
425
if "р-н" in elem :
420
426
district = elem .replace ("р-н" , "" ).strip ()
@@ -438,9 +444,10 @@ def define_location_data(self, block):
438
444
location_data ["street" ] = address_elements [- 2 ].strip ()
439
445
if street_type == "улица" :
440
446
location_data ["street" ] = location_data ["street" ].replace ("улица" , "" )
447
+
441
448
return location_data
442
449
443
- for after_district_address_element in address_elements [ind + 1 :]:
450
+ for k , after_district_address_element in enumerate ( address_elements [ind + 1 :]) :
444
451
if len (list (set (after_district_address_element .split (" " )).intersection (
445
452
NOT_STREET_ADDRESS_ELEMENTS ))) != 0 :
446
453
continue
@@ -449,6 +456,7 @@ def define_location_data(self, block):
449
456
continue
450
457
451
458
location_data ["street" ] = after_district_address_element .strip ()
459
+
452
460
return location_data
453
461
454
462
return location_data
@@ -471,6 +479,13 @@ def define_location_data(self, block):
471
479
if "ЖК" in address_elements [- 2 ]:
472
480
location_data ["residential_complex" ] = address_elements [- 2 ].strip ()
473
481
482
+ if (any (chr .isdigit () for chr in address_elements [- 1 ]) and "жк" not in address_elements [
483
+ - 1 ].lower () and
484
+ not any (
485
+ street_type in address_elements [- 1 ].lower () for street_type in STREET_TYPES )) and len (
486
+ address_elements [- 1 ]) < 10 :
487
+ location_data ["house_number" ] = address_elements [- 1 ].strip ()
488
+
474
489
for street_type in STREET_TYPES :
475
490
if street_type in address_elements [- 1 ]:
476
491
location_data ["street" ] = address_elements [- 1 ].strip ()
@@ -491,6 +506,13 @@ def define_location_data(self, block):
491
506
if len (address_elements ) < 3 :
492
507
continue
493
508
509
+ if (any (chr .isdigit () for chr in address_elements [- 1 ]) and "жк" not in address_elements [
510
+ - 1 ].lower () and
511
+ not any (
512
+ street_type in address_elements [- 1 ].lower () for street_type in STREET_TYPES )) and len (
513
+ address_elements [- 1 ]) < 10 :
514
+ location_data ["house_number" ] = address_elements [- 1 ].strip ()
515
+
494
516
if street_type in address_elements [- 1 ]:
495
517
location_data ["street" ] = address_elements [- 1 ].strip ()
496
518
if street_type == "улица" :
@@ -523,10 +545,12 @@ def define_price_data(self, block):
523
545
for element in elements :
524
546
if "₽/мес" in element .text :
525
547
price_description = element .text
526
- price_data ["price_per_month" ] = int ("" .join (price_description [:price_description .find ("₽/мес" ) - 1 ].split ()))
548
+ price_data ["price_per_month" ] = int (
549
+ "" .join (price_description [:price_description .find ("₽/мес" ) - 1 ].split ()))
527
550
528
551
if "%" in price_description :
529
- price_data ["commissions" ] = int (price_description [price_description .find ("%" ) - 2 :price_description .find ("%" )].replace (" " , "" ))
552
+ price_data ["commissions" ] = int (
553
+ price_description [price_description .find ("%" ) - 2 :price_description .find ("%" )].replace (" " , "" ))
530
554
531
555
return price_data
532
556
@@ -539,51 +563,40 @@ def define_price_data(self, block):
539
563
return price_data
540
564
541
565
def define_specification_data(self, block):
    """Extract floor, storey count, room count and area from a listing card.

    The text of the first ``GeneralInfoSectionRowComponent`` node inside the
    first ``LinkArea`` node of ``block`` is parsed.

    Args:
        block: Parsed listing card exposing a CSS ``select`` method whose
            results expose ``select`` and ``text``
            (presumably a BeautifulSoup tag; confirm against the caller).

    Returns:
        dict with keys ``floor``, ``floors_count``, ``rooms_count`` and
        ``total_meters``. Every value that cannot be parsed from the text
        keeps the sentinel ``-1``; ``rooms_count`` is whatever
        ``define_rooms_count`` returns for the text.

    Raises:
        IndexError: if either selector matches nothing.
    """
    specification_data = {
        "floor": -1,
        "floors_count": -1,
        "rooms_count": -1,
        "total_meters": -1,
    }

    # The title and the common properties are the same node text, so it is
    # looked up only once.
    common_properties = block.select("div[data-name='LinkArea']")[0] \
        .select("div[data-name='GeneralInfoSectionRowComponent']")[0].text

    # str.find returns -1 (never None) when the marker is absent, so compare
    # against -1; otherwise a missing marker would slice off the last
    # character and parse whatever number happens to be in the text.
    meters_end = common_properties.find("м²")
    if meters_end != -1:
        meters_text = common_properties[:meters_end].replace(",", ".")
        # NOTE(review): FLOATS_NUMBERS_REG_EXPRESSION is defined elsewhere in
        # the file; matches may apparently contain spaces or dashes, which
        # are stripped before conversion.
        numbers = re.findall(FLOATS_NUMBERS_REG_EXPRESSION, meters_text)
        if numbers:
            specification_data["total_meters"] = float(
                numbers[-1].replace(" ", "").replace("-", ""))

    if "этаж" in common_properties:
        floor_end = common_properties.rfind("этаж")
        # Look at the (up to) seven characters before the last "этаж";
        # clamp the start at 0 so a negative index cannot wrap around to the
        # end of the string.
        floor_window = common_properties[max(0, floor_end - 7):floor_end]
        floor_properties = floor_window.split("/")

        # Only a "<floor>/<floors_count>" pair is accepted; anything else
        # leaves both sentinels in place.
        if len(floor_properties) == 2:
            floor_digits = re.findall(r'\d+', floor_properties[0])
            if floor_digits:
                specification_data["floor"] = int(floor_digits[-1])

            total_digits = re.findall(r'\d+', floor_properties[1])
            if total_digits:
                specification_data["floors_count"] = int(total_digits[-1])

    specification_data["rooms_count"] = define_rooms_count(common_properties)

    return specification_data
587
600
588
601
def parse_block (self , block ):
589
602
common_data = dict ()
@@ -597,7 +610,8 @@ def parse_block(self, block):
597
610
price_data = self .define_price_data (block = block )
598
611
specification_data = self .define_specification_data (block = block )
599
612
600
- if self .is_by_homeowner and (author_data ["author_type" ] != "unknown" and author_data ["author_type" ] != "homeowner" ):
613
+ if self .is_by_homeowner and (
614
+ author_data ["author_type" ] != "unknown" and author_data ["author_type" ] != "homeowner" ):
601
615
return
602
616
603
617
if self .is_latin :
@@ -641,19 +655,22 @@ def parse_block(self, block):
641
655
642
656
specification_data ["price_per_m2" ] = float (0 )
643
657
if "price" in price_data :
644
- self .average_price = (self .average_price * self .parsed_announcements_count + price_data ["price" ])/ (self .parsed_announcements_count + 1 )
645
- price_data ["price_per_m2" ] = int (float (price_data ["price" ])/ specification_data ["total_meters" ])
658
+ self .average_price = (self .average_price * self .parsed_announcements_count + price_data ["price" ]) / (
659
+ self .parsed_announcements_count + 1 )
660
+ price_data ["price_per_m2" ] = int (float (price_data ["price" ]) / specification_data ["total_meters" ])
646
661
elif "price_per_month" in price_data :
647
- self .average_price = (self .average_price * self .parsed_announcements_count + price_data ["price_per_month" ])/ (self .parsed_announcements_count + 1 )
648
- price_data ["price_per_m2" ] = int (float (price_data ["price_per_month" ])/ specification_data ["total_meters" ])
662
+ self .average_price = (self .average_price * self .parsed_announcements_count + price_data [
663
+ "price_per_month" ]) / (self .parsed_announcements_count + 1 )
664
+ price_data ["price_per_m2" ] = int (float (price_data ["price_per_month" ]) / specification_data ["total_meters" ])
649
665
650
666
self .parsed_announcements_count += 1
651
667
652
668
if define_id_url (common_data ["link" ]) in self .result_parsed :
653
669
return
654
670
655
671
self .result_parsed .add (define_id_url (common_data ["link" ]))
656
- self .result .append (self .union (author_data , common_data , specification_data , price_data , page_data , location_data ))
672
+ self .result .append (
673
+ self .union (author_data , common_data , specification_data , price_data , page_data , location_data ))
657
674
658
675
if self .is_saving_csv :
659
676
self .save_results ()
@@ -705,7 +722,8 @@ def save_results(self):
705
722
706
723
def load_and_parse_page(self, number_page, count_of_pages, attempt_number):
    """Load one listing page and hand its HTML straight to ``parse_page``.

    Args:
        number_page: Page number passed to both ``load_page`` and
            ``parse_page``.
        count_of_pages: Forwarded unchanged to ``parse_page``.
        attempt_number: Forwarded unchanged to ``parse_page``.

    Returns:
        Whatever ``self.parse_page`` returns for the loaded page.
    """
    html = self.load_page(number_page=number_page)
    return self.parse_page(
        html=html,
        number_page=number_page,
        count_of_pages=count_of_pages,
        attempt_number=attempt_number,
    )
709
727
710
728
def run (self ):
711
729
print (f"\n { ' ' * 30 } Preparing to collect information from pages.." )
0 commit comments