#StackBounty: #python #python-3.x Scraping webelements if found

Bounty: 100

I'm currently working on a web scraper which, in my case, targets Sydsvenskan, but which is meant to become a template for other sites over time.

I have currently done:

from typing import Optional

import attr
import requests
from bs4 import BeautifulSoup
from loguru import logger
from requests import RequestException
from requests.exceptions import (
    ConnectionError,
    ConnectTimeout,
    ProxyError,
    ReadTimeout,
    ChunkedEncodingError,
    SSLError,
    Timeout
)


@attr.dataclass
class Data:
    """Container for one scraped record; every field defaults to an empty string."""

    # Label of the site the record belongs to.
    store: str = attr.ib(default="")
    # Scraped title text, left empty when nothing was found.
    name: Optional[str] = attr.ib(default="")
    # Scraped preamble/summary text, left empty when nothing was found.
    info: Optional[str] = attr.ib(default="")
    # Scraped image URL, left empty when nothing was found.
    image: Optional[str] = attr.ib(default="")


class Info:
    """Scraper for a single Sydsvenskan article page."""

    @staticmethod
    def from_page(url: str, timeout: float = 10.0, max_retries: Optional[int] = None) -> Data:
        """Download ``url`` and extract title, preamble and image into a ``Data``.

        Args:
            url: Address of the article page.
            timeout: Seconds to wait for the server before the request is
                treated as a (retryable) timeout. Without it a stalled
                connection would block forever.
            max_retries: How many times a transient network error may be
                retried. ``None`` (the default) keeps retrying indefinitely.

        Returns:
            A ``Data`` with ``store="Sydsvenskan"``. Fields whose element is
            missing from the page are empty strings; a 404 response yields a
            ``Data`` with only ``store`` set.

        Raises:
            RequestException: For non-retryable request failures (including
                HTTP error statuses other than 404), or for the last
                transient error once ``max_retries`` is exhausted.
            Exception: Any other unexpected error, re-raised unchanged after
                being logged.
        """
        attempts = 0
        while True:
            try:
                with requests.get(url, timeout=timeout) as response:
                    if response.status_code == 404:
                        return Data(store="Sydsvenskan")
                    # Raises HTTPError for every remaining 4xx/5xx status.
                    response.raise_for_status()
                    doc = BeautifulSoup(response.text, 'html.parser')

                name = doc.select_one('span.prose-title')
                info = doc.select_one('div.article__preamble-wrapper')
                image = doc.select_one('img.article-image')

                return Data(
                    store="Sydsvenskan",
                    name=name.text.strip() if name else '',
                    info=info.text.strip() if info else '',
                    # .get() avoids a KeyError for an <img> without a src attribute.
                    image=image.get('src', '') if image else '',
                )
            except (
                    ReadTimeout,
                    Timeout,
                    ConnectTimeout,
                    ConnectionError,
                    ChunkedEncodingError,
                    SSLError,
                    ProxyError
            ) as err:
                # Transient network problems: log and try again.
                logger.info(f"{type(err).__name__} at line {err.__traceback__.tb_lineno} of {__file__}: {err}")
                attempts += 1
                if max_retries is not None and attempts > max_retries:
                    raise
                continue

            except RequestException as err:
                logger.exception(f"{type(err).__name__} at line {err.__traceback__.tb_lineno} of {__file__},: {err}")
                # Bare raise keeps the concrete exception type and message
                # instead of replacing them with an empty RequestException.
                raise

            except Exception as err:
                logger.exception(f"{type(err).__name__} at line {err.__traceback__.tb_lineno} of {__file__},: {err}")
                # Re-raise the original error rather than a blank Exception.
                raise


def main():
    """Scrape one hard-coded article and log whether anything was extracted."""
    info = Info.from_page(
        url="https://www.sydsvenskan.se/2021-07-14/flera-anhallna-for-senaste-tidens-skjutningar-i-malmo")
    # from_page returns a Data object even for a 404 or a page without the
    # expected elements, so "nothing found" shows up as all scraped fields
    # being empty rather than as None.
    if info is None or not (info.name or info.info or info.image):
        logger.info('No new payload')
    else:
        logger.info(f'New payload: {info}')


if __name__ == '__main__':
    main()

The current "template" I have created is meant to make it easy to add more news sites in the future. We know that every site is different, so no two scrapers will ever be identical, but the template should make it easy to adapt to new sites.

I wonder if there is anything I can improve from here? I am mostly "worried" about the return of Data where I use the if else statement inside the dataclass, not sure if that is a correct way to do it?


Get this bounty!!!

#StackBounty: #python #python-3.x #wxpython wxPython application does not respond

Bounty: 50

I have a complex application, with a GUI that needs to dialogue with some I/O devices and with some WebAPI. I put my wx.Frame class in the main file, as I read that the GUI should be in the main thread to avoid freezing

# Script entry point: create the wx application object and the top-level
# window, then start the GUI event loop.
if __name__ == "__main__":
    app = wx.App()
    # Window is defined elsewhere; presumably the wx.Frame subclass mentioned above.
    frame = Window()
    app.MainLoop()

but still the GUI freezes very often, and sometimes it doesn’t show at all and a message saying "My_app is not responding" appears.
All the I/O and webAPI management is done in separate threads that are created by frame. The only GUI elements that are not in the main file are the pages that compose my notebook

# Each notebook page class is imported from its own module.
from PageOne import PageOne
from PageTwo import PageTwo
from PageThree import PageThree

...

# Create the three pages with the notebook (self.nb) as their parent.
self.page1 = PageOne(self.nb)
self.page2 = PageTwo(self.nb)
self.page3 = PageThree(self.nb)

# Register each page with the notebook under its tab label.
self.nb.AddPage(self.page1, "Page1")
self.nb.AddPage(self.page2, "Page2")
self.nb.AddPage(self.page3, "Page3")

All the communications between secondary threads and the GUI are done using wx.lib.newevent.NewEvent() in the main file and wx.PostEvent(self.parent, my_evt) in the threads.

I am using wxpython 4.1.1 and Ubuntu 20.04.2 LTS.

Any suggestions on how to prevent the GUI from freezing or becoming unresponsive? Would it be a better idea to use multiprocessing instead of multithreading? I know that threads are usually better for I/O-bound applications… but is that still true in my case, where the threads are all endless loops?

def run(self):
    """Call do_stuff() over and over without ever returning."""
    # NOTE(review): no exit condition or wait is visible in this loop;
    # whether do_stuff() blocks or yields is not shown here.
    while True:
        do_stuff()


Get this bounty!!!

#StackBounty: #python-3.x #re Issue with re.sub in Python3 and not in Python2

Bounty: 50

I’ve got an old script in Python 2.7 that runs a re.sub process correctly. However when I try to use it in Python 3 I get TypeError: expected string or bytes-like object

The relevant code is

# Each entry is [regex pattern, replacement text]; the entries are applied in
# order, one full pass over all lines per entry.
substitution_array = [
    [r"^Map From GroupLayer", "Add Map GroupLayer"],
    [r"^Map From", "Add Map Auto Layer"],
    # NOTE(review): this pattern appeared as "^s+Papersizes+.*"; the
    # backslashes of \s look to have been lost in transcription -- confirm.
    [r"^\s+Papersize\s+.*", ""],
    [r"^Set Window.*", ""],
    [r"^Open Window.*", ""],
]

for row in substitution_array:
    print(row[0])
    for x in newfile:
        line = re.sub(row[0], row[1], x)
        # In Python 2, filter() on a str returned a str. In Python 3 it
        # returns a lazy iterator, so the next pass would hand re.sub a
        # filter object ("expected string or bytes-like object"). Joining
        # the surviving characters restores the Python 2 result.
        line2 = "".join(filter(line.strip, line))
        newfile2.append(line2)
    print("Finished: " + row[0])
    newfile = newfile2
    newfile2 = []

I get the following output

G:\GIS_Tables\Vector_Data\Administrative\Cadastre\Road_Reserves>python3 Create_MB_from_WOR.py
--- Table Name: Road_Reserves
^Map From GroupLayer
Finished: ^Map From GroupLayer
^Map From
Traceback (most recent call last):
  File "Create_MB_from_WOR.py", line 43, in <module>
    line = re.sub(row[0],row[1],x)
  File "C:\OSGeo4W64\apps\Python37\lib\re.py", line 192, in sub
    return _compile(pattern, flags).sub(repl, string, count)
TypeError: expected string or bytes-like object

So it is failing on ,[r"^Map From","Add Map Auto Layer"] and when I delete this it fails on the next one as well.

I had a look at https://docs.python.org/3/library/re.html and think that I have escaped things correctly but what’s wrong here?

Here’s the same code running on the same data in Python 2.7 correctly
enter image description here


Get this bounty!!!

#StackBounty: #python #python-3.x #database #postgresql #validation How to validate row if it's compliant before pushing it to pgsql?

Bounty: 50

Say table structure is like:

-- Example table: id and enter_time are mandatory, comment may be NULL.
CREATE TABLE foo (
    id INTEGER  NOT NULL,
    enter_time TIMESTAMP NOT NULL,
    -- No NOT NULL constraint, so this column is optional.
    comment TEXT
);   

Now in python, say I get data like this:

# Sample row as received in Python. Note that enter_time is None although the
# table above declares that column NOT NULL.
foo_user = {"id": 123, "enter_time": None, "comment": ''}

How can I manually validate this data before sending this to pgsql?

Is there any library that already does this by pulling the schema information from pgsql and validating against it?


Get this bounty!!!

#StackBounty: #python-3.x #tkinter How to resize Canvas scrollable widget?

Bounty: 50

The idea is that the scrollable canvas and its text widgets grow or fill the entire root/toplevel when I resize it.

I can do this if I work on Frames but for a scrollable frame you need to create a canvas widget and make it scrollable. Now I don’t know if the problem is the canvas or the inserted widgets on the canvas?

import tkinter as tk
from tkinter import ttk

class ScrollableFrame():
    """Horizontally scrollable frame built from a Canvas plus a Scrollbar.

    Child widgets go into ``scrollable_frame``. The canvas and scrollbar are
    gridded into ``container`` (rows 0 and 1 of column 0), and the canvas
    cell is given weight so it grows and shrinks with the container.

    Attributes:
        container: Parent widget that hosts the canvas and the scrollbar.
        canvas: The ``tk.Canvas`` providing the scrolling viewport.
        scrollbar: Horizontal ``ttk.Scrollbar`` driving ``canvas.xview``.
        scrollable_frame: ``tk.Frame`` embedded in the canvas; parent for
            user widgets.
    """

    def __init__(self, container, *args, **kwargs):
        # *args / **kwargs are accepted for call compatibility but unused.
        self.container = container
        self.canvas = tk.Canvas(self.container, bg="green")
        self.scrollbar = ttk.Scrollbar(self.container, orient="horizontal", command=self.canvas.xview)

        # The inner frame is placed only through the canvas window item
        # below; it must not additionally be gridded/packed.
        self.scrollable_frame = tk.Frame(self.canvas)
        self.scrollable_frame.bind("<Configure>", self._update_scrollregion)

        self._window_id = self.canvas.create_window((0, 0), window=self.scrollable_frame, anchor="nw")
        self.canvas.configure(xscrollcommand=self.scrollbar.set)
        # Keep the embedded frame sized to the visible canvas area.
        self.canvas.bind("<Configure>", self._fit_inner_frame)

        self.canvas.grid(row=0, column=0, sticky="wesn")
        self.scrollbar.grid(row=1, column=0, sticky="wesn")

        # Without a weight the grid cell keeps its requested size and the
        # canvas never expands when the container is resized.
        self.container.grid_rowconfigure(0, weight=1)
        self.container.grid_columnconfigure(0, weight=1)

    def _update_scrollregion(self, event=None):
        """Make the scrollable area cover everything drawn on the canvas."""
        self.canvas.configure(scrollregion=self.canvas.bbox("all"))

    def _fit_inner_frame(self, event):
        """Stretch the embedded frame to the canvas size after a resize."""
        # Never go narrower than the content asks for, so that horizontal
        # scrolling still works when the window is small.
        width = max(event.width, self.scrollable_frame.winfo_reqwidth())
        self.canvas.itemconfigure(self._window_id, width=width, height=event.height)
        
if __name__ == "__main__":
    # Demo: two Text widgets side by side inside the scrollable frame.
    root = tk.Tk()
    root.configure(bg="grey20")

    s = ScrollableFrame(root)

    # "sticky" only stretches a widget within its cell; the cells themselves
    # need a weight before they can take up any extra space in the frame.
    s.scrollable_frame.rowconfigure(0, weight=1)
    s.scrollable_frame.columnconfigure(0, weight=1)
    s.scrollable_frame.columnconfigure(1, weight=1)

    t = tk.Text(s.scrollable_frame)
    t.grid(row=0, column=0, sticky="wesn")

    t2 = tk.Text(s.scrollable_frame)
    t2.grid(row=0, column=1, sticky="wesn")

    root.mainloop()
    

I’m glad for help


Get this bounty!!!

#StackBounty: #python-3.x #nlp #fuzzy-search #fuzzy #fuzzy-comparison Fuzzy matching with long sentence(s)

Bounty: 50

suppose I have the following dataframe:

ID       CompanyName         JobDescription
1        Green Grass LLC     "In the centre of Green Grass area..."
2        Johnny Inc.          "Johnny is currently looking for data analist that..."
3        Liamloy             "LiamLoy Corp. is established in New York..."
4        KaasKan             "In the forest we are walking..."

My main goal is to exclude the CompanyName in each JobDescription. The desired output would be:

ID       CompanyName         JobDescription
1        Green Grass LLC     "In the centre of area..."
2        Johnny Inc.          "is currently looking for data analist that..."
3        Liamloy             "is established in New York..."
4        KaasKan             "In the forest we are walking"

I have tried to word-tokenize the JobDescription (convert the sentence into words) and apply fuzzy matching to detect and remove the matches. However, this was not very successful. For example, when tokenizing the third JobDescription, "Liamloy" is compared to "LiamLoy" and "Corp.". Maybe this approach is not ideal; I have no idea at this point. I wonder if any of you would like to share your opinion and enlighten me on how I can successfully remove the CompanyName from each JobDescription.


Get this bounty!!!

#StackBounty: #python #performance #algorithm #python-3.x #numpy A Prime-Generating Algorithm and Python Script

Bounty: 100

I was wondering if it would be possible to optimise my python script? It is designed to quickly generate and print all the prime numbers under some inputted number. It is fairly fast currently, and I am wondering about its current time complexity, however, I’m also curious to know if there are further improvements and optimisations that can be done to make it even faster? Additionally, I’ve been trying to implement njit with Numba, however, I’m getting rather nasty-looking errors, and I’m wondering if anyone knows how to fix this, or if implementing Numba is worthwhile to begin with. I’ve also been compiling to C with Nuitka, which has also caused a speed up.

To summarise:

  • What is its current time complexity?
  • Are there any ways to improve it?
  • Could Numba be implemented, and if so, would it be worthwhile with a significant speed up?

I also appreciate any alternatives that are faster and/or are more memory-efficient (as long as ‘memory-efficient’ doesn’t cause a significant increase in completion time). I would like the final list of primes to be sorted if possible.

import numpy as np
import math

def primes(n):
    """Return the sorted list of all primes strictly below ``n``.

    Only numbers coprime to 6 are stored: sieve index ``i`` stands for the
    value ``(3 * i + 1) | 1`` (1, 5, 7, 11, 13, ...). 2 and 3 are prepended
    to the result explicitly.

    Args:
        n: Exclusive upper bound (integer).

    Returns:
        A list of Python ints in increasing order; empty when ``n <= 2``.
    """
    # The sieve layout needs at least one slot and unconditionally emits
    # 2 and 3, so bounds below 5 are answered directly.
    if n < 5:
        return [p for p in (2, 3) if p < n]

    sieve = np.ones(n // 3 + (n % 6 == 2), dtype=bool)
    sieve[0] = False  # index 0 stands for the value 1, which is not prime
    for i in range(math.isqrt(n) // 3 + 1):
        if sieve[i]:
            k = 3 * i + 1 | 1  # value represented by index i
            a = k * k
            b = 2 * k
            # Multiples of k that are coprime to 6 form two interleaved
            # progressions in index space, each with stride 2*k.
            sieve[(a // 3)::b] = False
            sieve[(a + b * (2 - (i & 1))) // 3::b] = False

    return np.r_[2, 3, ((3 * np.nonzero(sieve)[0] + 1) | 1)].tolist()

if __name__ == "__main__":
    # Read the bound from stdin and print the primes found for it.
    print(primes(int(input("What value do you want to check up to? "))))


Get this bounty!!!

#StackBounty: #python #performance #python-3.x #reinventing-the-wheel #numpy A Prime-Generating Algorithm and Python Script

Bounty: 100

I was wondering if it would be possible to optimise my python script? It is designed to quickly generate and print all the prime numbers under some inputted number. It is fairly fast currently, and I am wondering about its current time complexity, however, I’m also curious to know if there are further improvements and optimisations that can be done to make it even faster? Additionally, I’ve been trying to implement njit with Numba, however, I’m getting rather nasty-looking errors, and I’m wondering if anyone knows how to fix this, or if implementing Numba is worthwhile to begin with. I’ve also been compiling to C with Nuitka, which has also caused a speed up.

To summarise:

  • What is its current time complexity?
  • Are there any ways to improve it?
  • Could Numba be implemented, and if so, would it be worthwhile with a significant speed up?

I also appreciate any alternatives that are faster and/or are more memory-efficient (as long as ‘memory-efficient’ doesn’t cause a significant increase in completion time).

import numpy as np
import math

def primes(n):
    """Return all primes strictly below ``n`` as an increasing list.

    The sieve keeps one flag per number coprime to 6: index ``i`` represents
    the value ``(3 * i + 1) | 1`` (1, 5, 7, 11, 13, ...). The primes 2 and 3
    are added to the front of the result explicitly.

    Args:
        n: Exclusive upper bound (integer).

    Returns:
        A list of Python ints; empty when ``n <= 2``.
    """
    # Small bounds would index an empty sieve (n < 2) or report 2 and 3
    # even though they are not below n, so they are handled up front.
    if n < 5:
        return [p for p in (2, 3) if p < n]

    sieve = np.ones(n // 3 + (n % 6 == 2), dtype=bool)
    sieve[0] = False  # index 0 represents 1, which is not prime
    for i in range(math.isqrt(n) // 3 + 1):
        if sieve[i]:
            k = 3 * i + 1 | 1  # value represented by index i
            a = k * k
            b = 2 * k
            # Cross out k's multiples in both residue classes (stride 2*k
            # in index space for each).
            sieve[(a // 3)::b] = False
            sieve[(a + b * (2 - (i & 1))) // 3::b] = False

    return np.r_[2, 3, ((3 * np.nonzero(sieve)[0] + 1) | 1)].tolist()

if __name__ == "__main__":
    # Prompt for the bound, then print the list returned by primes().
    print(primes(int(input("What value do you want to check up to? "))))


Get this bounty!!!

#StackBounty: #python #python-3.x #image-processing #barcode-scanner Module installed but can't find a shared library installed by …

Bounty: 50

I need to install a module pyzbar which depends on another called zbar. According to the pypi link I first need to install zbar via

brew install zbar

then install the pyzbar

pip install pyzbar

Having done all of that, when I try to run code that imports a specific part of the module, I get an error.

from pyzbar.pyzbar import decode

The error is
ImportError: Unable to find zbar shared library

In the pypi project page it says [python 2.7, 3.4, 3.5, 3.6, 3.7] and I am using python 3.9. Could that be the issue? How to check where exactly is the issue coming from?


Get this bounty!!!

#StackBounty: #python #python-3.x #email #mime-types #mime What should be the content type to set for a multipart email after parsing a…

Bounty: 100

I have a multipart email with all types of attachments, i.e. multiple emails, plain text, PDF attachments, inline images and HTML too. After walking through the different parts of the multipart body and adding some text to the body of the main email, I wish to regenerate the whole email like the original. What is the correct method to do that? I am using Python 3.6. The code snippet I have tried is as follows:

# Rebuild a multipart email (msg) into new_message / new_message1 after the
# body text has been modified. Multi-line conditions are wrapped in
# parentheses so that they parse without line-continuation characters.
mail_attached_bool = False
new_message1 = email.message.EmailMessage()
attached_bool = False

mhtml = 'Modified html variable'
mbody = 'Modified text variable'

# Compiled once instead of on every loop iteration.
attach = re.compile('application/*')

# while parsing the multipart of the raw message: msg
if msg.is_multipart():
    for part in msg.walk():
        if part.get_content_type() == 'multipart/report':
            new_message.attach(mbody)
            if mhtml:
                new_message.attach(mhtml)

            for rel1 in part.walk():
                if rel1.get_content_type() == 'message/delivery-status':
                    new_message.attach(rel1)
                if rel1.get_content_type() == 'text/rfc822-headers':
                    new_message.attach(rel1)

        if part.get_content_type() in ["multipart/related",
                                       "multipart/mixed"]:
            new_message1.set_type("multipart/related")
            if mhtml:
                new_message1.attach(mhtml)
                print(999999)
            elif mbody:
                # NOTE(review): mbody is truthy in this branch, so the
                # comparison with '' below can never be true.
                if mbody == '':
                    mbody = MIMEText(warning_txt,'plain')
                new_message1.attach(mbody)

        # NOTE(review): msg.walk() already yields every descendant part and
        # part.walk() visits them again. A message/rfc822 part is attached
        # whole (mail_attached_bool) and its own sub-parts are then attached
        # individually as well -- this looks like the source of the
        # duplicated body/attachments; confirm.
        for rel in part.walk():
            mail_attached_bool = False
            attached_bool = False
            print(rel.get_content_type(), '------------cccccccc')
            # other kinds of attachments
            cdispo = str(rel.get('Content-Disposition'))
            attachment = attach.search(rel.get_content_type())

            if rel.get_content_type() in ['message/rfc822',]:
                new_message1.set_type('multipart/related')
                print(rel.get_content_type(), '----------content type')
                mail_attached_bool = True
                attached_bool = True
                x += 1

            if (rel.is_multipart()
                    and rel.get_content_type() not in [
                        "multipart/alternative",
                        "message/rfc822",
                    ]):
                new_message1.set_type(rel.get_content_type())

            # ignore the first html as its the mail body
            if rel.get_content_type() == "text/html" and cdispo=='None':
                i += 1
                if i == 1 and html_body:
                    continue
                print('i: ',i)
            # ignore the first plain text as its the mail body
            if rel.get_content_type() == "text/plain" and cdispo=='None':
                j += 1
                if j == 1 and text_body:
                    continue
                print('j: ',j)

            #--------------#
            if 1:#rel.get_content_type() != 'message/rfc822':#mail_attached_bool is False:
                # has mail content apart from body (ios)
                if rel.get_content_type() == "text/html":
                    new_message1.attach(rel)
                    print(rel.get_filename(),'-----   html  attached')

                if (rel.get_content_type() == "text/plain"
                        and rel.get('Content-Disposition') in [None, "inline"]):
                    new_message1.attach(rel)
                    print('---------------text attachment', 666666)

                if (rel.get_content_type() in ['image/png',
                                               'image/jpeg',
                                               'image/jpg']
                        or ('attachment' in cdispo)
                        or ('inline' in cdispo) or (attachment)):

                    # inline images and text
                    if ("inline" in cdispo
                            and not rel.get_content_type() in [
                                "text/plain",
                            ]
                            and not attached_bool):
                        attached_bool = True
                        new_message1.attach(rel)

                    # NOTE(review): `and` binds tighter than `or`, so this is
                    # attachment or ("attachment" in cdispo and not attached_bool)
                    # or cdispo == 'None' -- a part already attached by the
                    # inline branch above can be attached again here; confirm
                    # whether that grouping is intended.
                    if (attachment or "attachment" in cdispo
                            and (not attached_bool) or cdispo == 'None'):
                        new_message1.attach(rel)
                        attached_bool = True

                    elif cdispo == 'None' and (not attached_bool):
                        new_message1.attach(rel)
                        print('attaching here')

                if rel.get_content_type() in ['text/calendar']:
                    new_message1.attach(rel)

            if mail_attached_bool:
                new_message1.attach(rel)

        new_message.set_type('multipart/alternative')
        new_message.attach(new_message1)
        # Stop after the first outer part once new_message1 has content.
        if new_message1:
            print('new_message1 exists')
            break

Then send the mail.
When the mail is sent it is attaching the main mail body and its attachment 2 times in the new message object. Why does this happen? What is the correct content type to set for the new mail?


Get this bounty!!!