import requests
from bs4 import BeautifulSoup
import re
import json
import asyncio
import aiohttp
import time
from googleapiclient.discovery import build
from google.oauth2 import service_account
import pandas as pd
import openpyxl

SCOPES = [
    'https://www.googleapis.com/auth/spreadsheets',
    'https://www.googleapis.com/auth/drive'
]
credentials = service_account.Credentials.from_service_account_file('creds_service_acc.json', scopes=SCOPES)
# spreadsheet_service = build('sheets', 'v4', credentials=credentials)
# drive_service = build('drive', 'v3', credentials=credentials)
try:
    spreadsheet_service = build('sheets', 'v4', credentials=credentials)
except Exception:
    DISCOVERY_SERVICE_URL = 'https://sheets.googleapis.com/$discovery/rest?version=v4'
    spreadsheet_service = build('sheets', 'v4', credentials=credentials, discoveryServiceUrl=DISCOVERY_SERVICE_URL)

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'}


def files_jso():
    """Start tovar.json as an open JSON array; files_js() closes it later."""
    with open("./tovar.json", "w", encoding='utf8') as file:
        file.write('[')


def files_js():
    """Close the JSON array: strip the trailing comma and newline, then append ']'."""
    with open("./tovar.json", "r", encoding='utf8') as file:
        x = file.read()
    with open("./tovar.json", "w", encoding='utf8') as file1:
        file1.write(x[:-2])
    with open("./tovar.json", "a", encoding='utf8') as file3:
        file3.write(']')


def files_tovar():
    """Truncate the work files before a new run."""
    with open("./urltovar.txt", "w", encoding='utf8') as file:
        file.write('')
    with open("./catalogall.txt", "w", encoding='utf8') as file:
        file.write('')


def prov_pagen():
    """Walk catalog.txt and write each category URL (with ?n=<page size> when the page offers one) to catalogall.txt."""
    with open('./catalog.txt', "r") as file1:
        for line in file1:  # loop over every line of the file
            url = line.strip()
            if not url:
                continue
            s = requests.Session()
            r = s.get(url, headers=headers).text
            soup = BeautifulSoup(r, 'html.parser')
            pagen = soup.find_all('select', {"id": 'nb_page_items'})
            u = ' '.join(re.findall(r'>([^<>]+) ', str(pagen)))
            if len(u) == 0:
                print(url)
                with open("./catalogall.txt", "a") as file:
                    file.write(url + '\n')
            else:
                print(f'{url}?n={u}')
                with open("./catalogall.txt", "a") as file:
                    file.write(f'{url}?n={u}' + '\n')


def catalog():
    """Collect the category links from the site root into catalog.txt, one URL per line."""
    url = 'https://allsuvenir.com.ua/ru/'
    s = requests.Session()
    res = s.get(url, headers=headers).text
    soup = BeautifulSoup(res, 'html.parser')
    items = soup.find_all('div', class_='list')
    # pull every link out of the category list blocks
    urls = ' '.join(re.findall(r'href="([^<>"]+)"', str(items)))
    line = urls.replace(' ', '\n')
    with open("./catalog.txt", "w") as file:
        file.write(line)


def urls_tovar():
    """Collect product URLs from every expanded category page into urltovar.txt."""
    with open('./catalogall.txt', "r") as file1:
        for line in file1:  # loop over every line of the file
            url = line.strip()
            if not url:
                continue
            print(url)
            # e.g. url = 'https://allsuvenir.com.ua/ru/114-zazhigalki?id_category=114&n=135'
            s = requests.Session()
            r = s.get(url, headers=headers).text
            soup = BeautifulSoup(r, 'html.parser')
            items = soup.find_all('ul', class_='product_list grid row')
            urls = ' '.join(re.findall(r'"product-name" href="([^<>]+)" itemprop="url"', str(items)))
            line = urls.replace(' ', '\n')
            print(line)
            with open("./urltovar.txt", "a") as file:
                file.write(line + '\n')


async def main1():
    """Parse every product page still listed in urltovar.txt and append the result to tovar.json."""
    async with aiohttp.ClientSession() as session:
        # Read the remaining product URLs up front; each processed URL is removed from the file below.
        with open('./urltovar.txt', "r", encoding='utf8') as file1:
            urls = [line.strip() for line in file1 if line.strip()]
        for url in urls:
            # url = 'https://allsuvenir.com.ua/ru/glavnaya/1344-mk14.html?search_query=MK14&results=2'
            async with session.get(url, headers=headers) as response:
                # await asyncio.sleep(0.2)
                r = await response.read()
                soup = BeautifulSoup(r, 'html.parser')
                print(f'===========> {url}')
                try:
                    table = soup.find_all('div', {"id": 'center_column'})
                    for item in table:
                        try:
                            name = item.find('div', class_='pb-center-column col-xs-12 col-sm-4').find('h1').get_text(strip=True)
                            print(name)
                            code = item.find('p', {"id": 'product_reference'}).get_text(strip=True)
                            codes = code.replace('Артикул', '')  # strip the "Артикул" (SKU) label, leaving the bare code
                            try:
                                ops = item.find('div', {"id": 'short_description_content'}).get_text()
                                print(ops)
                            except Exception:
                                ops = ""
                            prices = item.find('span', {"id": 'our_price_display'}).text
                            price = prices[:prices.find('$')]  # the price text ends with '$'
                            fotos = item.find('ul', {"id": 'thumbs_list_frame'})
                            foto1 = ' '.join(re.findall(r'"other-views" href="([^<>]+)" title="', str(fotos)))
                            foto = foto1.replace(' ', ', ')
                            js1 = {'code': f'ALL_{codes}', 'name': name, 'price': price, 'foto': foto, 'op': ops}
                            with open("./tovar.json", "a", encoding='utf8') as file:
                                json.dump(js1, file, ensure_ascii=False)
                                file.write(',' + '\n')
                            # Remove the processed URL from urltovar.txt so a restart only retries what is left.
                            with open("./urltovar.txt", "r", encoding='utf8') as f:
                                lines = f.readlines()
                            pattern = re.compile(re.escape(url))
                            with open("./urltovar.txt", "w", encoding='utf8') as f:
                                for line in lines:
                                    if pattern.search(line) is None:
                                        f.write(line)
                            print(codes)
                        except Exception as a:
                            print(a)
                except Exception as a:
                    print(a)
                    await asyncio.sleep(100)  # back off before the outer loop reconnects
                    print('____________________________________________________________')


def search_id():
    """Match parsed products against the sheet: mark availability in column P and the price in column Z."""
    js = []
    with open("./tovar.json", "r", encoding='utf8') as file:
        rr = json.load(file)
    for r in rr:
        codes = r['code']
        names = r['name']
        price = float(r['price'])
        foto = r['foto']
        op = r['op']
        spreadsheet_id = '1KQUjDeeOE8fIC1DlA7N8VL32Ee4Wh0CgC28p3cggsuc'  # target spreadsheet ID
        range_ = 'Export Products Sheet!AF1:AF15000'
        request = spreadsheet_service.spreadsheets().values().get(spreadsheetId=spreadsheet_id, range=range_)
        response = request.execute()
        a = response['values']
        search_ids = [f'{codes}']
        time.sleep(1)
        if search_ids in a:
            for i, j in enumerate(a):
                if j == search_ids:
                    stroka = i + 1
                    print(f'[=>] Found in the sheet! Product code: {codes} Row: {stroka} Price: {price}')
                    if price > 0.00:
                        tovar = [['+']]
                        price1 = [[f'{price}']]
                        resp = spreadsheet_service.spreadsheets().values().update(
                            spreadsheetId=spreadsheet_id,
                            range="Export Products Sheet!P{}".format(stroka),
                            valueInputOption="RAW",
                            body={'values': tovar}).execute()
                        resp2 = spreadsheet_service.spreadsheets().values().update(
                            spreadsheetId=spreadsheet_id,
                            range="Export Products Sheet!Z{}".format(stroka),
                            valueInputOption="RAW",
                            body={'values': price1}).execute()
                    elif price == 0.00:
                        tovar = [['-']]
                        price1 = [[f'{price}']]
                        resp = spreadsheet_service.spreadsheets().values().update(
                            spreadsheetId=spreadsheet_id,
                            range="Export Products Sheet!P{}".format(stroka),
                            valueInputOption="RAW",
                            body={'values': tovar}).execute()
                        resp2 = spreadsheet_service.spreadsheets().values().update(
                            spreadsheetId=spreadsheet_id,
                            range="Export Products Sheet!Z{}".format(stroka),
                            valueInputOption="RAW",
                            body={'values': price1}).execute()
                    else:
                        print('not')
        else:
            # The parsed product is not in the sheet yet: collect it into an Excel report.
            js1 = {'code': codes, 'name': names, 'price': price, 'foto': foto, 'op': op}
            js.append(js1)
            DF = pd.DataFrame.from_dict(js)
            DF.to_excel('./all_notfiles.xlsx', sheet_name='no product', index=False)
            # print(f'[=>] Wrote {codes} to the file')


def find_elements_starting_with(a, prefix):
    """Return the elements of a that start with prefix."""
    return [s for s in a if s.startswith(prefix)]


def finds_ids():
    """Mark sheet rows whose ALL_ codes were not seen during parsing with '-' in column P."""
    lis = []
    valuesss = [[int(0)]]
    with open("./tovar.json", "r", encoding='utf8') as file:
        rr = json.load(file)
    for r in rr:
        codes1 = r['code']
        # Codes in tovar.json are already written with the ALL_ prefix, so only add it when it is missing.
        codes = codes1 if codes1.startswith('ALL_') else f'ALL_{codes1}'
        lis.append(codes)
    spreadsheet_id = '1KQUjDeeOE8fIC1DlA7N8VL32Ee4Wh0CgC28p3cggsuc'  # '1c5gAmgzLqbNfAC6a4DL7F4KDPXI5xn3joD0Z8Kxr-jY'
    range_ = 'Export Products Sheet!AF2:AF15000'
    request = spreadsheet_service.spreadsheets().values().get(spreadsheetId=spreadsheet_id, range=range_)
    response = request.execute()
    a = response['values']
    time.sleep(2)
    # Flatten the [['ALL_x'], ['ALL_y'], ...] response into a plain list of code strings.
    st2 = str(a).replace('[', '').replace(']', '').replace("'", '')
    li = list(st2.split(", "))
    prefix = 'ALL'
    resultgoog = find_elements_starting_with(li, prefix)
    result = []
    for iii in resultgoog:
        if iii not in lis:
            result.append(iii)
            search_ids = [f'{iii}']
            time.sleep(2)
            if search_ids in a:
                for i, j in enumerate(a):
                    if j == search_ids:
                        stroka = i + 2
                        print(f'[=>] Found in the sheet! Product code: {search_ids} Row: {stroka}! This product was missing during parsing!')
                        tovar = [['-']]
                        resp = spreadsheet_service.spreadsheets().values().update(
                            spreadsheetId=spreadsheet_id,
                            range="Export Products Sheet!P{}".format(stroka),
                            valueInputOption="RAW",
                            body={'values': tovar}).execute()
                        # resp = spreadsheet_service.spreadsheets().values().update(
                        #     spreadsheetId=spreadsheet_id,
                        #     range="Export Products Sheet!Z{}".format(stroka),
                        #     valueInputOption="RAW",
                        #     body={'values': valuesss}).execute()


if __name__ == '__main__':
    files_tovar()
    catalog()
    prov_pagen()
    urls_tovar()
    files_jso()
    print('[*] Checking what is left in the file')
    time.sleep(10)
    check_finish1 = True
    while check_finish1:
        # Count how many URL lines are still waiting in urltovar.txt.
        with open('./urltovar.txt') as f:
            a = f.read().count('\n')
        print(a)
        if a > 1:
            try:
                asyncio.run(main1())
                check_finish1 = False
            except Exception:
                print('[*] New connection')
                time.sleep(10)
        else:
            break
    # asyncio.run(main1())
    files_js()
    # try:
    #     search_id()
    # except Exception as e:
    #     print(e)
    #     print('[*] Did not upload all the products.')
    #
    # try:
    #     finds_ids()
    # except Exception as e:
    #     print(e)
    #     print('[*] Did not check all the products that were missing from the site.')