first commit

commit 2af75cd9c6
.gitignore
@@ -0,0 +1,26 @@
# Ignore log, temp, and generated HTML files
*.log
*.tmp
*.html
# Ignore the directory named temp
temp/

# Ignore the config.ini file
config.ini

# Ignore everything inside the temp_folder directory
temp_folder/*

# Ignore all files and subdirectories in the logs directory
logs/*
.venv/
test.py
prov.py
all_notfiles.xlsx
creds_service_acc.json
*.zip
*.db
*.txt
*.json
# Re-include requirements.txt even though *.txt is ignored above
!/requirements.txt
README.md
@@ -0,0 +1,39 @@
# Parser for https://allsuvenir.com.ua/ru/

* 👉 Parse the catalog
* 👉 Then collect every product from the catalog pages
* 👉 Then write everything to a JSON file (record sketch below)
* 👉 Then push it to a Google Sheet

* 👉 Products that are not in the sheet are written to a separate page
* 👉 Products that were not seen during the parse get a "-" in the sheet
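Each parsed product becomes one object in tovar.json. A sketch of that record, assuming nothing beyond the fields on.py writes; the values are invented examples, not real data:

```
# Illustrative tovar.json record: the field names are the ones on.py
# writes; the values here are made-up examples.
{
    "code": "ALL_12345",           # article number with the ALL_ prefix
    "name": "Product name",        # taken from the product page <h1>
    "price": "1.25",               # the text before the "$" in the price block
    "foto": "url1.jpg, url2.jpg",  # photo links joined with ", "
    "op": "Short description"      # short description block, "" if absent
}
```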

Create a virtual environment:

```
python -m venv venv
```

Activate it (the path below is the PowerShell activation script on Windows; NAME is your project folder):

```
C:/python/NAME/venv/Scripts/Activate.ps1
```
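On Linux or macOS the equivalent activation command is:

```
source venv/bin/activate
```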

Install the dependencies:

```
pip install -r requirements.txt
```

To build a standalone executable, either use PyInstaller:

```
pip install pyinstaller
```

```
pyinstaller --onefile --console on.py
```

or auto-py-to-exe:

```
pip install auto-py-to-exe
```

```
auto-py-to-exe
```

> Don't forget!
> creds_service_acc.json is different for every user! It is needed for the Google Sheet!
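For reference, the key file is consumed at the top of on.py like this (the scopes and calls below are the ones the script itself uses):

```
from google.oauth2 import service_account
from googleapiclient.discovery import build

SCOPES = [
    'https://www.googleapis.com/auth/spreadsheets',
    'https://www.googleapis.com/auth/drive'
]

# Every user supplies their own service-account key file
credentials = service_account.Credentials.from_service_account_file(
    'creds_service_acc.json', scopes=SCOPES)
spreadsheet_service = build('sheets', 'v4', credentials=credentials)
```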
@@ -0,0 +1,11 @@
import subprocess


def start():
    # Replace 'on.py' with the name of the script you want to launch
    file_to_run = 'on.py'

    # Run the file in a child Python process
    subprocess.run(['python', file_to_run])


if __name__ == '__main__':
    start()
on.py
@@ -0,0 +1,387 @@
import requests
from bs4 import BeautifulSoup
import re
import json
import asyncio
import aiohttp
import time
from googleapiclient.discovery import build
from google.oauth2 import service_account
import pandas as pd
import openpyxl


SCOPES = [
    'https://www.googleapis.com/auth/spreadsheets',
    'https://www.googleapis.com/auth/drive'
]


credentials = service_account.Credentials.from_service_account_file('creds_service_acc.json', scopes=SCOPES)

try:
    spreadsheet_service = build('sheets', 'v4', credentials=credentials)
except Exception:
    # Fall back to an explicit discovery document if the default lookup fails
    DISCOVERY_SERVICE_URL = 'https://sheets.googleapis.com/$discovery/rest?version=v4'
    spreadsheet_service = build('sheets', 'v4', credentials=credentials, discoveryServiceUrl=DISCOVERY_SERVICE_URL)

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'}

def files_jso():
    # Start tovar.json as an open JSON array
    with open("./tovar.json", "w", encoding='utf8') as file:
        file.write('[')


def files_js():
    # Close the JSON array: drop the trailing ",\n" and append "]"
    with open("./tovar.json", "r", encoding='utf8') as file:
        x = file.read()

    with open("./tovar.json", "w", encoding='utf8') as file1:
        file1.write(x[:-2])

    with open("./tovar.json", "a", encoding='utf8') as file3:
        file3.write(']')


def files_tovar():
    # Clear the working files left over from the previous run
    with open("./urltovar.txt", "w", encoding='utf8') as file:
        file.write('')
    with open("./catalogall.txt", "w", encoding='utf8') as file:
        file.write('')

def prov_pagen():
    # For each catalog URL, read the "items per page" <select> and append
    # ?n=<max items> so a single request returns the whole category
    with open('./catalog.txt', "r") as file1:
        for line in file1:  # walk the file line by line
            url = line.strip()
            if not url:
                continue
            s = requests.Session()
            r = s.get(url, headers=headers).text
            soup = BeautifulSoup(r, 'html.parser')
            pagen = soup.find_all('select', {"id": 'nb_page_items'})
            u = ' '.join(re.findall(r'>([^<>]+)</option> </select>', str(pagen)))
            if len(u) == 0:
                # No pagination selector on this page
                print(url)
                with open("./catalogall.txt", "a") as file:
                    file.write(url + '\n')
            else:
                print(f'{url}?n={u}')
                with open("./catalogall.txt", "a") as file:
                    file.write(f'{url}?n={u}' + '\n')

def catalog():
    # Collect the category links from the site's main page into catalog.txt
    url = 'https://allsuvenir.com.ua/ru/'
    s = requests.Session()
    res = s.get(url, headers=headers).text

    soup = BeautifulSoup(res, 'html.parser')
    items = soup.find_all('div', class_='list')
    urls = ' '.join(re.findall(r'<a href="([^<>]+)" title="">', str(items)))
    line = urls.replace(' ', '\n')
    with open("./catalog.txt", "w") as file:
        file.write(line)

def urls_tovar():
    # Collect every product URL from the expanded catalog pages
    with open('./catalogall.txt', "r") as file1:
        for line in file1:  # walk the file line by line
            url = line.strip()
            if not url:
                continue
            print(url)
            s = requests.Session()
            r = s.get(url, headers=headers).text

            soup = BeautifulSoup(r, 'html.parser')
            items = soup.find_all('ul', class_='product_list grid row')

            links = ' '.join(re.findall(r'"product-name" href="([^<>]+)" itemprop="url"', str(items)))
            found = links.replace(' ', '\n')
            print(found)
            with open("./urltovar.txt", "a") as file:
                file.write(found + '\n')

async def main1():
    # Fetch every product page, extract the fields, append the record to
    # tovar.json, then remove the processed URL from urltovar.txt so a
    # restart only retries what is left
    async with aiohttp.ClientSession() as session:

        file1 = open('./urltovar.txt', "r")
        while True:  # walk the URL file line by line
            line = file1.readline()
            if not line:
                break
            url = line.strip()

            async with session.get(url, headers=headers) as response:
                r = await response.read()
                soup = BeautifulSoup(r, 'html.parser')
                print(f'===========> {url}')
                try:
                    table = soup.find_all('div', {"id": 'center_column'})
                    for item in table:
                        try:
                            name = item.find('div', class_='pb-center-column col-xs-12 col-sm-4').find('h1').get_text(strip=True)
                            print(name)
                            code = item.find('p', {"id": 'product_reference'}).get_text(strip=True)
                            codes = code.replace('Артикул', '')  # strip the label the site prepends to the article number

                            try:
                                ops = item.find('div', {"id": 'short_description_content'}).get_text()
                                print(ops)
                            except Exception:
                                ops = ""

                            prices = item.find('span', {"id": 'our_price_display'}).text
                            price = prices[:prices.find('$')]  # keep the text before the "$"
                            fotos = item.find('ul', {"id": 'thumbs_list_frame'})
                            foto1 = ' '.join(re.findall(r'"other-views" href="([^<>]+)" title="', str(fotos)))
                            foto = foto1.replace(' ', ', ')

                            js1 = {'code': f'ALL_{codes}',
                                   'name': name,
                                   'price': price,
                                   'foto': foto,
                                   'op': ops}

                            with open("./tovar.json", "a", encoding='utf8') as file:
                                json.dump(js1, file, ensure_ascii=False)
                                file.write(',' + '\n')

                            # Rewrite urltovar.txt without the URL that was just processed
                            with open("./urltovar.txt", "r", encoding='utf8') as f:
                                lines = f.readlines()

                            pattern = re.compile(re.escape(url))
                            with open("./urltovar.txt", "w", encoding='utf8') as f:
                                for line in lines:
                                    result = pattern.search(line)
                                    if result is None:
                                        f.write(line)

                            print(codes)
                        except Exception as a:
                            print(a)
                except Exception as a:
                    print(a)
                    # Back off before continuing; time.sleep would block the event loop
                    await asyncio.sleep(100)
                    print('____________________________________________________________')

def search_id():
    # Compare every tovar.json record against column AF of the Google Sheet:
    # for matches, write availability to column P and the price to column Z;
    # everything that is not in the sheet goes to all_notfiles.xlsx
    js = []
    with open("./tovar.json", "r", encoding='utf8') as file:
        rr = json.load(file)
        for r in rr:
            codes = r['code']
            names = r['name']
            price = float(r['price'])
            foto = r['foto']
            op = r['op']

            spreadsheet_id = '1KQUjDeeOE8fIC1DlA7N8VL32Ee4Wh0CgC28p3cggsuc'  # sheet ID
            range_ = 'Export Products Sheet!AF1:AF15000'
            # The AF column is re-fetched for every record
            request = spreadsheet_service.spreadsheets().values().get(spreadsheetId=spreadsheet_id, range=range_)
            response = request.execute()

            a = response['values']
            search_ids = [f'{codes}']
            time.sleep(1)
            if search_ids in a:
                for i, j in enumerate(a):
                    if j == search_ids:
                        stroka = i + 1  # the range starts at AF1, so row = index + 1
                        print(f'[=>] Found in the sheet! Product code: {codes} at row {stroka}, price {price}')
                        if price > 0.00:
                            tovar = [['+']]
                            price1 = [[f'{price}']]
                            resp = spreadsheet_service.spreadsheets().values().update(
                                spreadsheetId=spreadsheet_id,
                                range="Export Products Sheet!P{}".format(stroka),
                                valueInputOption="RAW",
                                body={'values': tovar}).execute()
                            resp2 = spreadsheet_service.spreadsheets().values().update(
                                spreadsheetId=spreadsheet_id,
                                range="Export Products Sheet!Z{}".format(stroka),
                                valueInputOption="RAW",
                                body={'values': price1}).execute()

                        elif price == 0.00:
                            tovar = [['-']]
                            price1 = [[f'{price}']]
                            resp = spreadsheet_service.spreadsheets().values().update(
                                spreadsheetId=spreadsheet_id,
                                range="Export Products Sheet!P{}".format(stroka),
                                valueInputOption="RAW",
                                body={'values': tovar}).execute()
                            resp2 = spreadsheet_service.spreadsheets().values().update(
                                spreadsheetId=spreadsheet_id,
                                range="Export Products Sheet!Z{}".format(stroka),
                                valueInputOption="RAW",
                                body={'values': price1}).execute()
                        else:
                            print('not')

            else:
                # Not in the sheet: collect it for the "no product" report
                js1 = {'code': codes,
                       'name': names,
                       'price': price,
                       'foto': foto,
                       'op': op}
                js.append(js1)
                DF = pd.DataFrame.from_dict(js)
                DF.to_excel('./all_notfiles.xlsx', sheet_name='no product', index=False)

def find_elements_starting_with(a, prefix):
    # Return the elements of a that start with prefix
    return [s for s in a if s.startswith(prefix)]


def finds_ids():
    # Mark sheet rows whose product code did not appear in this parse
    # with "-" in column P
    lis = []
    with open("./tovar.json", "r", encoding='utf8') as file:
        rr = json.load(file)
        for r in rr:
            codes = r['code']  # records already carry the ALL_ prefix
            lis.append(codes)

    spreadsheet_id = '1KQUjDeeOE8fIC1DlA7N8VL32Ee4Wh0CgC28p3cggsuc'  # sheet ID; alternative: '1c5gAmgzLqbNfAC6a4DL7F4KDPXI5xn3joD0Z8Kxr-jY'
    range_ = 'Export Products Sheet!AF2:AF15000'
    request = spreadsheet_service.spreadsheets().values().get(spreadsheetId=spreadsheet_id, range=range_)
    response = request.execute()

    a = response['values']
    time.sleep(2)
    # Flatten the [[value], [value], ...] response into a plain list of strings
    st = str(a).replace('[', '')
    st1 = st.replace(']', '')
    st2 = st1.replace("'", '')

    li = list(st2.split(", "))
    prefix = 'ALL'
    resultgoog = find_elements_starting_with(li, prefix)
    result = []
    for iii in resultgoog:
        if iii not in lis:
            result.append(iii)
            search_ids = [f'{iii}']
            time.sleep(2)
            if search_ids in a:
                for i, j in enumerate(a):
                    if j == search_ids:
                        stroka = i + 2  # the range starts at AF2, so row = index + 2
                        print(f'[=>] Found in the sheet! Product code: {search_ids} at row {stroka}. This product was not seen in the parse!')
                        tovar = [['-']]
                        resp = spreadsheet_service.spreadsheets().values().update(
                            spreadsheetId=spreadsheet_id,
                            range="Export Products Sheet!P{}".format(stroka),
                            valueInputOption="RAW",
                            body={'values': tovar}).execute()

if __name__ == '__main__':
    files_tovar()
    catalog()
    prov_pagen()
    urls_tovar()
    files_jso()
    print('[*] Checking what is left in the file')
    time.sleep(10)
    check_finish1 = True
    while check_finish1:
        # Count how many URLs are still waiting in urltovar.txt
        a = len(re.findall(r"[\n]+?", open('./urltovar.txt').read()))
        print(a)
        if a > 1:
            try:
                loop = asyncio.get_event_loop()
                loop.run_until_complete(main1())
                check_finish1 = False
            except Exception:
                print('[*] Reconnecting')
                time.sleep(10)
        else:
            break
    files_js()
    # try:
    #     search_id()
    # except Exception as e:
    #     print(e)
    #     print('[*] Did not upload all the products.')
    #
    # try:
    #     finds_ids()
    # except Exception as e:
    #     print(e)
    #     print('[*] Did not check all the products that were missing from the site.')

requirements.txt
@@ -0,0 +1,37 @@
aiohttp==3.8.3
aiosignal==1.3.1
async-timeout==4.0.2
attrs==22.2.0
beautifulsoup4==4.11.1
cachetools==5.3.0
certifi==2022.12.7
charset-normalizer==2.1.1
et-xmlfile==1.1.0
frozenlist==1.3.3
google-api-core==2.11.0
google-api-python-client==2.74.0
google-auth==2.16.0
google-auth-httplib2==0.1.0
google-auth-oauthlib==0.8.0
googleapis-common-protos==1.58.0
httplib2==0.21.0
idna==3.4
multidict==6.0.4
numpy==1.24.1
oauthlib==3.2.2
openpyxl==3.0.10
pandas==1.5.3
protobuf==4.21.12
pyasn1==0.4.8
pyasn1-modules==0.2.8
pyparsing==3.0.9
python-dateutil==2.8.2
pytz==2022.7.1
requests==2.28.2
requests-oauthlib==1.3.1
rsa==4.9
six==1.16.0
soupsieve==2.3.2.post1
uritemplate==4.1.1
urllib3==1.26.14
yarl==1.8.2