Example of Email Validation using Regex in Python and DNS

 First simple method Regex

The re module is very useful for this purpose. Here’s a common regular expression pattern that captures most valid email addresses:

  • The email starts with the local part: a combination of letters, digits, dots, underscores, percent signs, plus signs, or hyphens.
  • Followed by the @ symbol.
  • Followed by the domain name, which is a combination of letters, digits, dots, and hyphens.
  • The domain ends with a top-level domain (TLD) which is at least two characters long.

 

 contacts = Contact.objects.filter(is_checked=True, email__contains='info@', for_mailing=True)
        # Select all contacten dat groen staan voor mailing
        for contact in contacts:
            if self.validate_email(contact.email):
                print("Valid")
            else:
                print("Not valid")
                contact.is_checked = False
                contact.for_mailing = False
                contact.save()


    def validate_email(self, email):
        # Regular expression for validating an Email
        email_regex = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'

        # Match the input email against the regular expression
        if re.match(email_regex, email):
            return True
        else:
            return False

 Email-validator  python lib via DNS 

# Validate the address with the email-validator library's validate_email().
# NOTE(review): check_deliverability=False presumably switches off the DNS
# deliverability lookup, leaving a syntax-only check — confirm in the
# email-validator documentation.
emailinfo = validate_email(email, check_deliverability=False)

 This did not work in my setup.

Self-written script

You can then create a command that visits the specified website to check if the domain is still active and to locate an email address on the contact page. If you need to verify company emails, validate personal emails, or perform real-time email validation within your application, you could use an API like SendGrid's.

# Names of well-known public mailbox providers (mostly their domains).
# NOTE(review): this list is not referenced by the code shown here;
# presumably it is meant for skipping addresses whose domain is not the
# contact's own website — confirm against the caller.
services = [
    "skynet.be",
    "gmail.com",
    "hotmail.com",
    "live.be",
    "telenet.be",
    "outlook.com",
    "busmail.net",
    "yahoo",
]
class Command(BaseCommand):
    """Check unchecked contacts against the website behind their email domain.

    For each contact the domain part of the email address is opened as a
    website. When the site answers, the home page (and, failing that, the
    contact page) is searched for an email address on the same domain, and
    the contact is updated with the outcome.
    """

    def handle(self, *args, **options):
        """Process all contacts with ``is_checked=False`` in batches of 100.

        ``check_contact`` marks every contact it handles as checked, so the
        set of unchecked contacts shrinks on each pass until it is empty.
        """
        while True:
            contacts = Contact.objects.filter(is_checked=False)
            remaining = contacts.count()
            if remaining == 0:
                break
            print(remaining)
            for contact in contacts[:100]:
                self.check_contact(contact)

    def check_contact(self, contact):
        """Visit the website of the contact's email domain and update the contact.

        Sets ``website_working`` when the site answers with HTTP 200, replaces
        ``contact.email`` (and sets ``email_on_website``) when an address on
        the same domain is found on the site, and always marks the contact as
        checked before saving it.
        """
        email = None
        old_email = contact.email
        # Tolerate a missing address: an empty domain only makes the request
        # below fail, after which the contact is still marked as checked.
        domain = (old_email or '').split('@')[-1]
        website = "http://%s" % domain
        print(website)
        try:
            # verify=False skips TLS certificate verification.
            response = requests.get(website, verify=False, timeout=10)
            # response.url is the final URL, i.e. after any redirects.
            parsed_uri = urlparse(response.url)
            domain = parsed_uri.netloc.replace('www.', '')
            if response.status_code == 200:
                contact.website_working = True
                soup = BeautifulSoup(response.text, "lxml")
                email = self.get_email(soup, domain)
                if not email:
                    contact_url = self.get_contact_url(soup, parsed_uri)
                    if contact_url:
                        # Fetch the contact page itself, not the home page
                        # a second time.
                        response = requests.get(contact_url, verify=False, timeout=10)
                        soup = BeautifulSoup(response.text, "lxml")
                        email = self.get_email(soup, domain)
        except Exception:
            # Best effort: an unreachable or unparsable site must not stop
            # the run. "except Exception" (not a bare except) keeps Ctrl-C
            # working.
            traceback.print_exc(file=sys.stdout)

        if email:
            contact.email_on_website = True
            contact.email = email
        contact.is_checked = True
        print(
            "old email: %s | new email: %s | website on working: %s | email on website: %s"
            % (old_email, contact.email, contact.website_working, contact.email_on_website)
        )
        try:
            contact.save()
        except Exception:
            # Log the failure first, then retry with the previous address.
            # NOTE(review): presumably the save fails when the new address
            # violates a model constraint — confirm against the Contact model.
            traceback.print_exc(file=sys.stdout)
            contact.old_email = True
            contact.email = old_email
            contact.save()

    def get_email(self, soup, domain):
        """Return an email address containing *domain* found in *soup*, or None.

        A ``mailto:`` link is tried first; only when that yields nothing
        usable is the page markup scanned for an address.
        """
        candidate = self.email_from_url(soup)
        if candidate and domain in candidate:
            return candidate
        candidate = self.parse_email(soup)
        if candidate and domain in candidate:
            return candidate
        return None

    def email_from_url(self, soup):
        """Return the lower-cased address of the first ``mailto:`` link, or None.

        The visible link text is used when it contains an ``@``. Otherwise
        the address is read from the link's ``href`` (anything from ``?`` on
        is dropped), because the text is often a label rather than the
        address.
        """
        link = soup.find("a", href=re.compile("mailto:"))
        if not link:
            return None
        text = link.text.strip().lower()
        if "@" in text:
            return text
        href = link.get("href") or ""
        address = href.split("mailto:", 1)[-1].split("?", 1)[0].strip().lower()
        return address if "@" in address else text

    def parse_email(self, soup):
        """Return the first email-like string in the page markup, or None."""
        pattern = re.compile(r'[\w\-][\w\-\.]+@[\w\-][\w\-\.]+[a-zA-Z]{1,4}')
        match = pattern.search(soup.prettify())
        return match.group(0) if match else None

    def get_contact_url(self, soup, parsed_uri):
        """Return the absolute URL of the first link whose href contains "contact".

        Relative links are resolved against the scheme and host in
        *parsed_uri*. Returns None when there is no such link, or when the
        link carries its own scheme/host that is not this site (another
        website, ``mailto:``, ...).
        """
        link = soup.find("a", href=re.compile("contact"))
        if not link or not link.get("href", None):
            return None
        url = link['href']
        if parsed_uri.netloc not in url:
            if url.startswith('//') or re.match(r'[a-zA-Z][a-zA-Z0-9+.-]*:', url):
                # Not a relative path: gluing it onto this site's root would
                # produce a meaningless URL.
                return None
            root = '{uri.scheme}://{uri.netloc}'.format(uri=parsed_uri)
            url = root + url if url.startswith("/") else root + '/' + url
        print("%s 1" % url)
        return url
 
 
 

 Email-address-validation  via API

https://www.twilio.com/docs/sendgrid/ui/managing-contacts/email-address-validation  
 
https://www.twilio.com/docs/sendgrid/api-reference/email-address-validation/validate-an-email
 
 Code example
 
import os
from sendgrid import SendGridAPIClient


# Build the API client; the key is read from the SENDGRID_API_KEY
# environment variable.
sg = SendGridAPIClient(os.environ.get('SENDGRID_API_KEY'))

# Request body: a "source" label and the address to validate.
data = {
    "source": "Newsletter",
    "email": "example@example.com",
}

# POST the body to the email validation endpoint of the client.
response = sg.client.validations.email.post(request_body=data)

# Show the status code, the body and the headers of the response, in that order.
for part in (response.status_code, response.body, response.headers):
    print(part)
 
 
{"result":{"email":"info@domain.nl","verdict":"Risky","score":0.34637,"local":"info","host":"domain.nl",
"checks":{"domain":{"has_valid_address_syntax":true,"has_mx_or_a_record":true,"is_suspected_disposable_address":false},
"local_part":{"is_suspected_role_address":true},"additional":{"has_known_bounces":false,"has_suspected_bounces":true}},
"source":"NEWSLETTER","ip_address":"176.9.149.242"}}
 
In most cases, the domain of an invalid email address has no A or MX records in DNS.
 
Therefore, we can only validate business emails with this verification 
because they are from a different provider and are literally not in the register. 
 
{"result":{"email":"info@jacobsinfra.nl","verdict":"Invalid","score":0,"local":"info",
"host":"jacobsinfra.nl","checks":
{"domain":{"has_valid_address_syntax":true,
"has_mx_or_a_record":false,"is_suspected_disposable_address":false},"local_part":
{"is_suspected_role_address":false},"additional":
{"has_known_bounces":false,"has_suspected_bounces":false}},
"source":"NEWSLETTER","ip_address":"176.9.149.242"}} 
 
So, you could first check the syntax of the address, then split the email and use the domain to look up
its DNS records; if there are no records, the email is invalid.
 
First, you need to install the dnspython library if you haven't already:

pip install dnspython
 
There are two versions of the library, a new one and an old one.
I currently have an old system that still runs Python 2.7,
so we write the DNS validation with the old library.
 
:-/ 
 
AttributeError Traceback (most recent call last)
<ipython-input-5-55aafb908ebf> in <module>()
----> 1 a_records = dns.resolver.resolve(domain, 'A')

AttributeError: 'module' object has no attribute 'resolve' 
 
 
The old library uses dns.resolver.query; the new library uses dns.resolver.resolve.
 
def check_dns_records(self, domain, contact):
    """Return True when *domain* has both A and MX records in DNS.

    On failure a short explanation is printed and stored in
    ``contact.comment`` (the contact is not saved here) and False is
    returned.

    NOTE(review): a domain that publishes MX records but no A record is
    rejected, because the A lookup fails first — confirm that requiring both
    record types is intended.
    """
    # Newer dnspython releases provide resolve(); the old library only has
    # query(). Use whichever is available.
    lookup = getattr(dns.resolver, 'resolve', None) or dns.resolver.query

    def fail(msg):
        # Report the problem, remember it on the contact, signal "invalid".
        print(msg)
        contact.comment = msg
        return False

    print(domain)
    try:
        # Check for A record
        a_records = lookup(domain, 'A')
        if not a_records:
            return fail("No A records found for domain: " + domain)
        print("A records for domain " + domain)
        for record in a_records:
            print(record)

        # Check for MX record
        mx_records = lookup(domain, 'MX')
        if not mx_records:
            return fail("No MX records found for domain: " + domain)
        print("MX records for domain:")
        for record in mx_records:
            # One formatted string prints the same on Python 2 and 3.
            print("%s %s" % (record.exchange, record.preference))
        return True
    except (dns.resolver.NoAnswer, dns.resolver.NXDOMAIN, dns.resolver.NoNameservers,
            dns.exception.Timeout, dns.name.EmptyLabel):
        return fail("Error checking DNS records for domain " + domain)

Bonus function to generate a name from an email address

def split_email(self, email):
    """Derive a display name from the domain of *email*.

    The domain's last dot-separated label (the extension) is dropped,
    hyphens and underscores become spaces and every word is capitalized,
    e.g. ``info@jacobs-infra.nl`` -> ``Jacobs Infra``. Returns an empty
    string when *email* is empty or contains no ``@``.
    """
    parts = email.split('@') if email else []
    if len(parts) < 2:
        # No domain part to work with.
        return ''

    # Extract the domain part after '@'
    domain_part = parts[1]

    # Remove the domain extension
    domain_without_extension = '.'.join(domain_part.split('.')[:-1])

    # Replace hyphens and underscores with spaces and capitalize each word
    words = re.split('-|_', domain_without_extension)
    return ' '.join(word.capitalize() for word in words)

 


 



Comments