From 2af75cd9c6bcf2bf233e31fdc25f61549b846396 Mon Sep 17 00:00:00 2001
From: Alex
Date: Wed, 18 Oct 2023 13:23:33 +0300
Subject: [PATCH] first commit

---
 .gitignore       |  26 ++++
 README.md        |  39 +++++
 main.py          |  11 ++
 on.py            | 387 +++++++++++++++++++++++++++++++++++++++++++++++
 requirements.txt |  37 +++++
 5 files changed, 500 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 README.md
 create mode 100644 main.py
 create mode 100644 on.py
 create mode 100644 requirements.txt

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..4901c74
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,26 @@
+# Ignore files with the .log and .tmp extensions
+*.log
+*.tmp
+*.html
+# Ignore the directory named temp
+temp/
+
+# Ignore the config.ini file
+config.ini
+
+# Ignore all files in the temp_folder directory
+temp_folder/*
+
+# Ignore all files and subdirectories in the logs directory
+logs/*
+.venv/
+test.py
+prov.py
+all_notfiles.xlsx
+creds_service_acc.json
+*.zip
+*.db
+*.txt
+*.json
+# Re-include requirements.txt, which the *.txt rule above would otherwise ignore
+!/requirements.txt
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..7a7e39a
--- /dev/null
+++ b/README.md
@@ -0,0 +1,39 @@
+# Scraper for https://allsuvenir.com.ua/ru/
+
+* 👉 Scrape the catalog pages
+* 👉 Then scrape every product from each catalog page
+* 👉 Then write the products to a JSON file
+* 👉 Then upload them to a Google Sheet
+
+* 👉 Products that are not yet in the sheet are written out separately
+* 👉 Products that were missing from the scrape get a "-" in the sheet
+
+
+```
+ python -m venv venv
+```
+
+```
+ C:/python/NAME/venv/Scripts/Activate.ps1
+```
+```
+ pip install -r requirements.txt
+```
+```
+ pip install pyinstaller
+```
+```
+ pyinstaller --onefile --console on.py
+```
+or
+```
+ pip install auto-py-to-exe
+```
+```
+ auto-py-to-exe
+```
+
+
+
+> Don't forget!
+> creds_service_acc.json is unique to each user! It is the service-account key for the Google Sheet!
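+
+Each scraped product is appended to `tovar.json` as one record. A minimal sketch of its shape (the field values here are illustrative, not real site data):
+
+```python
+# A record as written by on.py; ALL_ is prefixed onto the site's article number
+js1 = {'code': 'ALL_MK14',              # article number with ALL_ prefix
+       'name': 'Example product name',  # product title (h1)
+       'price': '1.50',                 # price text before the '$' sign
+       'foto': 'https://example.com/1.jpg, https://example.com/2.jpg',  # thumbnails
+       'op': 'Short description'}       # may be empty if the block is missing
+```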
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..65672e6
--- /dev/null
+++ b/main.py
@@ -0,0 +1,11 @@
+import subprocess
+def start():
+    # Replace 'on.py' with the name of your file if it differs
+    file_to_run = 'on.py'
+
+    # Run the file
+    subprocess.run(['python', file_to_run])
+
+
+if __name__ == '__main__':
+    start()
\ No newline at end of file
diff --git a/on.py b/on.py
new file mode 100644
index 0000000..bc112e6
--- /dev/null
+++ b/on.py
@@ -0,0 +1,387 @@
+import requests
+from bs4 import BeautifulSoup
+import re
+import json
+import asyncio
+import aiohttp
+import time
+from googleapiclient.discovery import build
+from google.oauth2 import service_account
+import pandas as pd
+import openpyxl
+
+
+SCOPES = [
+    'https://www.googleapis.com/auth/spreadsheets',
+    'https://www.googleapis.com/auth/drive'
+]
+
+
+credentials = service_account.Credentials.from_service_account_file('creds_service_acc.json', scopes=SCOPES)
+
+try:
+    spreadsheet_service = build('sheets', 'v4', credentials=credentials)
+except Exception:
+    # Fall back to an explicit discovery URL if the default discovery fails
+    DISCOVERY_SERVICE_URL = 'https://sheets.googleapis.com/$discovery/rest?version=v4'
+    spreadsheet_service = build('sheets', 'v4', credentials=credentials,
+                                discoveryServiceUrl=DISCOVERY_SERVICE_URL)
+
+headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'}
+
+
+def files_jso():
+    # Start tovar.json with the opening bracket of the JSON array
+    with open("./tovar.json", "w", encoding='utf8') as file:
+        file.write('[')
+
+
+def files_js():
+    # Strip the trailing ',\n' left after the last record and close the array
+    with open("./tovar.json", "r", encoding='utf8') as file:
+        x = file.read()
+
+    with open("./tovar.json", "w", encoding='utf8') as file:
+        file.write(x[:-2])
+
+    with open("./tovar.json", "a", encoding='utf8') as file:
+        file.write(']')
+
+
+def files_tovar():
+    # Reset the work files left over from the previous run
+    with open("./urltovar.txt", "w", encoding='utf8') as file:
+        file.write('')
+    with open("./catalogall.txt", "w", encoding='utf8') as file:
+        file.write('')
+
+
+def prov_pagen():
+    # For every catalog URL, read the per-page <select> and append ?n=...
+    # so each catalog page can be fetched with more items per page
+    with open('./catalog.txt', "r") as file1:
+        for line in file1:  # iterate over the file line by line
+            url = line.strip()
+            if not url:
+                continue
+            s = requests.Session()
+            r = s.get(url, headers=headers).text
+            soup = BeautifulSoup(r, 'html.parser')
+            pagen = soup.find_all('select', {"id": 'nb_page_items'})
+            u = ' '.join(re.findall(r'>([^<>]+) ', str(pagen)))
+            if len(u) == 0:
+                print(url)
+                with open("./catalogall.txt", "a") as file:
+                    file.write(url + '\n')
+            else:
+                print(f'{url}?n={u}')
+                with open("./catalogall.txt", "a") as file:
+                    file.write(f'{url}?n={u}' + '\n')
+
+
+def catalog():
+    # Collect the catalog links from the home page into catalog.txt
+    url = 'https://allsuvenir.com.ua/ru/'
+    s = requests.Session()
+    res = s.get(url, headers=headers).text
+
+    soup = BeautifulSoup(res, 'html.parser')
+    items = soup.find_all('div', class_='list')
+    url = ' '.join(re.findall(r'href="([^<>]+)"', str(items)))
+    line = url.replace(' ', '\n')
+    with open("./catalog.txt", "w") as file:
+        file.write(line)
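+
+
+# A hedged alternative sketch (not called anywhere): catalog() above pulls
+# hrefs by regexing over str(items), which is fragile to attribute order and
+# whitespace. The same links could be collected with BeautifulSoup's own API:
+def catalog_links_sketch(html):
+    """Collect catalog hrefs from homepage markup (assumes the div.list blocks above)."""
+    soup = BeautifulSoup(html, 'html.parser')
+    links = []
+    for block in soup.find_all('div', class_='list'):
+        for a_tag in block.find_all('a', href=True):
+            links.append(a_tag['href'])
+    return links
+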
+
+def urls_tovar():
+    # For every (paginated) catalog page, collect the product URLs
+    with open('./catalogall.txt', "r") as file1:
+        for line in file1:  # iterate over the file line by line
+            url = line.strip()
+            if not url:
+                continue
+            print(url)
+            s = requests.Session()
+            r = s.get(url, headers=headers).text
+
+            soup = BeautifulSoup(r, 'html.parser')
+            items = soup.find_all('ul', class_='product_list grid row')
+
+            url = ' '.join(re.findall(r'"product-name" href="([^<>]+)" itemprop="url"', str(items)))
+            line = url.replace(' ', '\n')
+            print(line)
+            with open("./urltovar.txt", "a") as file:
+                file.write(line + '\n')
+
+
+async def main1():
+    async with aiohttp.ClientSession() as session:
+        file1 = open('./urltovar.txt', "r")
+        for line in file1:  # iterate over the remaining product URLs
+            url = line.strip()
+            if not url:
+                continue
+            async with session.get(url, headers=headers) as response:
+                r = await response.read()
+                soup = BeautifulSoup(r, 'html.parser')
+                print(f'===========> {url}')
+                try:
+                    table = soup.find_all('div', {"id": 'center_column'})
+                    for item in table:
+                        try:
+                            name = item.find('div', class_='pb-center-column col-xs-12 col-sm-4').find('h1').get_text(strip=True)
+                            print(name)
+                            code = item.find('p', {"id": 'product_reference'}).get_text(strip=True)
+                            # Strip the site's "Артикул" (article number) label
+                            codes = code.replace('Артикул', '')
+
+                            try:
+                                ops = item.find('div', {"id": 'short_description_content'}).get_text()
+                                print(ops)
+                            except Exception:
+                                ops = ""
+
+                            prices = item.find('span', {"id": 'our_price_display'}).text
+                            price = prices[:prices.find('$')]
+                            fotos = item.find('ul', {"id": 'thumbs_list_frame'})
+                            foto1 = ' '.join(re.findall(r'"other-views" href="([^<>]+)" title="', str(fotos)))
+                            foto = foto1.replace(' ', ', ')
+
+                            js1 = {'code': f'ALL_{codes}',
+                                   'name': name,
+                                   'price': price,
+                                   'foto': foto,
+                                   'op': ops}
+
+                            # Append the record to tovar.json
+                            with open("./tovar.json", "a", encoding='utf8') as file:
+                                json.dump(js1, file, ensure_ascii=False)
+                                file.write(',' + '\n')
+
+                            # Drop the processed URL from urltovar.txt
+                            with open("./urltovar.txt", "r", encoding='utf8') as f:
+                                lines = f.readlines()
+                            pattern = re.compile(re.escape(url))
+                            with open("./urltovar.txt", "w", encoding='utf8') as f:
+                                for row in lines:
+                                    if pattern.search(row) is None:
+                                        f.write(row)
+
+                            print(codes)
+                        except Exception as a:
+                            print(a)
+                except Exception as a:
+                    print(a)
+                    time.sleep(100)
+                print('____________________________________________________________')
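+
+
+# A hedged sketch (not wired into main1): main1() awaits each URL one at a
+# time, so the aiohttp session gains little concurrency. The same pages could
+# be fetched in parallel with asyncio.gather:
+async def fetch_all_sketch(urls):
+    """Fetch several product pages concurrently and return their HTML bodies."""
+    async with aiohttp.ClientSession() as session:
+        async def fetch(url):
+            async with session.get(url, headers=headers) as resp:
+                return await resp.read()
+        return await asyncio.gather(*(fetch(u) for u in urls))
+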
+
+def search_id():
+    # For every scraped product: if its code is already in the sheet, update
+    # availability (column P) and price (column Z); otherwise collect it into
+    # all_notfiles.xlsx
+    js = []
+    with open("./tovar.json", "r", encoding='utf8') as file:
+        rr = json.load(file)
+    for r in rr:
+        codes = r['code']
+        names = r['name']
+        price = float(r['price'])
+        foto = r['foto']
+        op = r['op']
+
+        spreadsheet_id = '1KQUjDeeOE8fIC1DlA7N8VL32Ee4Wh0CgC28p3cggsuc'  # spreadsheet ID
+        range_ = 'Export Products Sheet!AF1:AF15000'
+        request = spreadsheet_service.spreadsheets().values().get(spreadsheetId=spreadsheet_id, range=range_)
+        response = request.execute()
+
+        a = response['values']
+        search_ids = [codes]
+        time.sleep(1)
+        if search_ids in a:
+            for i, j in enumerate(a):
+                if j == search_ids:
+                    stroka = i + 1
+                    print(f'[=>] Found in the sheet! Product code: {codes} at row: {stroka} Price: {price}')
+                    if price > 0.00:
+                        tovar = [['+']]
+                        price1 = [[f'{price}']]
+                        resp = spreadsheet_service.spreadsheets().values().update(
+                            spreadsheetId=spreadsheet_id,
+                            range="Export Products Sheet!P{}".format(stroka),
+                            valueInputOption="RAW",
+                            body={'values': tovar}).execute()
+                        resp2 = spreadsheet_service.spreadsheets().values().update(
+                            spreadsheetId=spreadsheet_id,
+                            range="Export Products Sheet!Z{}".format(stroka),
+                            valueInputOption="RAW",
+                            body={'values': price1}).execute()
+                    elif price == 0.00:
+                        tovar = [['-']]
+                        price1 = [[f'{price}']]
+                        resp = spreadsheet_service.spreadsheets().values().update(
+                            spreadsheetId=spreadsheet_id,
+                            range="Export Products Sheet!P{}".format(stroka),
+                            valueInputOption="RAW",
+                            body={'values': tovar}).execute()
+                        resp2 = spreadsheet_service.spreadsheets().values().update(
+                            spreadsheetId=spreadsheet_id,
+                            range="Export Products Sheet!Z{}".format(stroka),
+                            valueInputOption="RAW",
+                            body={'values': price1}).execute()
+                    else:
+                        print('not')
+        else:
+            # Not in the sheet: collect it for the "no product" Excel file
+            js1 = {'code': codes,
+                   'name': names,
+                   'price': price,
+                   'foto': foto,
+                   'op': op}
+            js.append(js1)
+            DF = pd.DataFrame.from_dict(js)
+            DF.to_excel('./all_notfiles.xlsx', sheet_name='no product', index=False)
+
+
+def find_elements_starting_with(a, prefix):
+    return [s for s in a if s.startswith(prefix)]
+
+
+def finds_ids():
+    # Mark sheet rows whose codes did NOT appear in this scrape with '-'
+    lis = []
+    with open("./tovar.json", "r", encoding='utf8') as file:
+        rr = json.load(file)
+    for r in rr:
+        lis.append(r['code'])  # r['code'] already carries the ALL_ prefix
+
+    spreadsheet_id = '1KQUjDeeOE8fIC1DlA7N8VL32Ee4Wh0CgC28p3cggsuc'  # spreadsheet ID
+    range_ = 'Export Products Sheet!AF2:AF15000'
+    request = spreadsheet_service.spreadsheets().values().get(spreadsheetId=spreadsheet_id, range=range_)
+    response = request.execute()
+
+    a = response['values']
+    time.sleep(2)
+    # Flatten the 2-D values list into a flat list of code strings
+    st2 = str(a).replace('[', '').replace(']', '').replace("'", '')
+    li = list(st2.split(", "))
+    resultgoog = find_elements_starting_with(li, 'ALL')
+    result = []
+    for iii in resultgoog:
+        if iii not in lis:
+            result.append(iii)
+            search_ids = [iii]
+            time.sleep(2)
+            if search_ids in a:
+                for i, j in enumerate(a):
+                    if j == search_ids:
+                        stroka = i + 2  # the range starts at row 2, so offset by 2
+                        print(f'[=>] Found in the sheet! Product code: {search_ids} at row: {stroka}! This product was missing from the scrape!')
+                        tovar = [['-']]
+                        resp = spreadsheet_service.spreadsheets().values().update(
+                            spreadsheetId=spreadsheet_id,
+                            range="Export Products Sheet!P{}".format(stroka),
+                            valueInputOption="RAW",
+                            body={'values': tovar}).execute()
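+
+
+# A hedged sketch (not used above): search_id() and finds_ids() issue one
+# values().update call per cell, which runs into per-minute quota limits
+# (hence the time.sleep calls). The Sheets v4 API also accepts a batched form:
+def batch_update_sketch(spreadsheet_id, updates):
+    """updates: list of (a1_range, value) pairs, e.g. [('Export Products Sheet!P5', '+')]."""
+    body = {'valueInputOption': 'RAW',
+            'data': [{'range': rng, 'values': [[val]]} for rng, val in updates]}
+    return spreadsheet_service.spreadsheets().values().batchUpdate(
+        spreadsheetId=spreadsheet_id, body=body).execute()
+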
+
+
+if __name__ == '__main__':
+    files_tovar()
+    catalog()
+    prov_pagen()
+    urls_tovar()
+    files_jso()
+    print('[*] Checking what is left in the file')
+    time.sleep(10)
+    check_finish1 = True
+    while check_finish1:
+        # Count the URLs still left in urltovar.txt
+        a = len(re.findall(r"[\n]+?", open('./urltovar.txt').read()))
+        print(a)
+        if a > 1:
+            try:
+                loop = asyncio.get_event_loop()
+                loop.run_until_complete(main1())
+                check_finish1 = False
+            except Exception:
+                print('[*] Reconnecting')
+                time.sleep(10)
+        else:
+            break
+    files_js()
+    # try:
+    #     search_id()
+    # except Exception as e:
+    #     print(e)
+    #     print('[*] Did not upload all the products.')
+    #
+    # try:
+    #     finds_ids()
+    # except Exception as e:
+    #     print(e)
+    #     print('[*] Did not check all the products missing from the site.')
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..b5a4af5
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,37 @@
+aiohttp==3.8.3
+aiosignal==1.3.1
+async-timeout==4.0.2
+attrs==22.2.0
+beautifulsoup4==4.11.1
+cachetools==5.3.0
+certifi==2022.12.7
+charset-normalizer==2.1.1
+et-xmlfile==1.1.0
+frozenlist==1.3.3
+google-api-core==2.11.0
+google-api-python-client==2.74.0
+google-auth==2.16.0
+google-auth-httplib2==0.1.0
+google-auth-oauthlib==0.8.0
+googleapis-common-protos==1.58.0
+httplib2==0.21.0
+idna==3.4
+multidict==6.0.4
+numpy==1.24.1
+oauthlib==3.2.2
+openpyxl==3.0.10
+pandas==1.5.3
+protobuf==4.21.12
+pyasn1==0.4.8
+pyasn1-modules==0.2.8
+pyparsing==3.0.9
+python-dateutil==2.8.2
+pytz==2022.7.1
+requests==2.28.2
+requests-oauthlib==1.3.1
+rsa==4.9
+six==1.16.0
+soupsieve==2.3.2.post1
+uritemplate==4.1.1
+urllib3==1.26.14
+yarl==1.8.2