Wyrażenia regularne

Wyrażenia regularne w Pythonie

Wyrażenia regularne (regex) to potężne narzędzie do wyszukiwania, dopasowywania i manipulacji tekstem.

Moduł re:

Python używa modułu re do pracy z wyrażeniami regularnymi.

import re

# Basic search: a sample sentence and the literal pattern to look for.
text = "Python jest świetnym językiem programowania"
pattern = r"Python"

# re.search() returns a Match object on success and None otherwise,
# so the result is compared against None explicitly.
first_hit = re.search(pattern, text)
if first_hit is not None:
    print("Znaleziono 'Python'")

# re.findall() collects every non-overlapping occurrence into a list.
matches = re.findall(pattern, text)
print(matches)  # ['Python']

Podstawowe wzorce:

import re

text = "Mam 25 lat, telefon: 123-456-789, email: jan@example.com"

# Every example has the same shape: pick a pattern, collect all of its
# non-overlapping matches with re.findall(), and print the resulting list.
examples = (
    # Digits: one or more consecutive digits.
    # -> ['25', '123', '456', '789']
    ("numbers", r"\d+"),
    # Letters: one or more ASCII letters (a letter with a diacritic would
    # end the run, because it is outside a-z / A-Z).
    # -> ['Mam', 'lat', 'telefon', 'email', 'jan', 'example', 'com']
    ("letters", r"[a-zA-Z]+"),
    # Word characters: one or more letters, digits or underscores.
    # -> ['Mam', '25', 'lat', 'telefon', '123', '456', '789', 'email', 'jan', 'example', 'com']
    ("words", r"\w+"),
    # Whitespace: one or more whitespace characters (here: single spaces).
    # -> [' ', ' ', ' ', ' ', ' ', ' ']
    ("spaces", r"\s+"),
)

found = {}
for label, pattern in examples:
    found[label] = re.findall(pattern, text)
    print(found[label])

# Expose the results under the names used by the rest of the lesson.
numbers = found["numbers"]
words = found["words"]
spaces = found["spaces"]

Znaki specjalne:

import re

text = "Cena: $19.99, Rabat: 20%, Kod: ABC-123"

# '$' and '.' are regex metacharacters, so they are backslash-escaped
# wherever they have to match literally. '%' and '-' (outside a character
# class) have no special meaning and are written as-is.
price_pattern = r"\$\d+\.\d+"       # dollar sign, digits, literal dot, digits
percent_pattern = r"\d+%"           # digits followed by a percent sign
code_pattern = r"[A-Z]{3}-\d{3}"    # three uppercase letters, hyphen, three digits
# local part, '@', domain, literal dot, top-level part of at least 2 letters
email_pattern = r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}"

# Price in dollars
prices = re.findall(price_pattern, text)
print(prices)  # ['$19.99']

# Percentages
percentages = re.findall(percent_pattern, text)
print(percentages)  # ['20%']

# Hyphenated codes
codes = re.findall(code_pattern, text)
print(codes)  # ['ABC-123']

# Email addresses
emails = re.findall(email_pattern, text)
print(emails)  # [] (this text contains no email address)

Kwantyfikatory (ilości):

import re

text = "a aa aaa aaaa aaaaa"

# One pattern per quantifier form; all are applied to the same text.
patterns = [
    r"a",       # exactly one 'a'
    r"a+",      # one or more 'a'
    r"a*",      # zero or more 'a'
    r"a?",      # zero or one 'a'
    r"a{2}",    # exactly two 'a'
    r"a{2,}",   # two or more 'a'
    r"a{2,4}",  # between two and four 'a'
]

# Collect the matches for every pattern first, then report them in order.
results = {candidate: re.findall(candidate, text) for candidate in patterns}
for pattern, matches in results.items():
    print(f"{pattern}: {matches}")

# Output. Patterns that can match the empty string (a*, a?) also produce ''
# at every space and at the very end of the text:
# a: ['a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a']
# a+: ['a', 'aa', 'aaa', 'aaaa', 'aaaaa']
# a*: ['a', '', 'aa', '', 'aaa', '', 'aaaa', '', 'aaaaa', '']
# a?: ['a', '', 'a', 'a', '', 'a', 'a', 'a', '', 'a', 'a', 'a', 'a', '', 'a', 'a', 'a', 'a', 'a', '']
# a{2}: ['aa', 'aa', 'aa', 'aa', 'aa', 'aa']
# a{2,}: ['aa', 'aaa', 'aaaa', 'aaaaa']
# a{2,4}: ['aa', 'aaa', 'aaaa', 'aaaa']

Grupy i referencje:

import re

text = "Jan Kowalski, Anna Nowak, Piotr Wiśniewski"

# Capturing groups: each pair of parentheses captures part of the match.
# When a pattern has more than one group, re.findall() returns a list of
# tuples with one element per group.
pattern = r"(\w+) (\w+)"  # first name, a space, surname
matches = re.findall(pattern, text)
print(matches)  # [('Jan', 'Kowalski'), ('Anna', 'Nowak'), ('Piotr', 'Wiśniewski')]

# Named groups: (?P<name>...) lets a group be read back by name instead of
# by position. The names must be present in the pattern; a bare "(?P" with
# no <name> is a syntax error in the regex.
pattern = r"(?P<imie>\w+) (?P<nazwisko>\w+)"
matches = re.finditer(pattern, text)
for match in matches:
    print(f"Imię: {match.group('imie')}, Nazwisko: {match.group('nazwisko')}")

# Referring to a captured group: the version number is the only group.
text = "Python 3.9, Python 3.10, Python 3.11"
pattern = r"Python (\d+\.\d+)"
version_re = re.compile(pattern)  # compiled once, reused for both calls below

# With exactly one group in the pattern, findall() returns just that
# group's text for each match rather than the whole match.
matches = version_re.findall(text)
print(matches)  # ['3.9', '3.10', '3.11']

# Substitution with a back-reference: \1 in the replacement stands for
# whatever group 1 captured in the match being replaced.
new_text = version_re.sub(r"Python wersja \1", text)
print(new_text)  # Python wersja 3.9, Python wersja 3.10, Python wersja 3.11

Funkcje modułu re:

import re

text = "Python jest świetny! Python jest potężny! Python jest prosty!"

# The same literal pattern is used by every call except re.split().
pattern = r"Python"

# re.search(): the first occurrence anywhere in the text (or None).
match = re.search(pattern, text)
if match is not None:
    print(f"Znaleziono na pozycji: {match.start()}-{match.end()}")

# re.findall(): every occurrence, as a list of strings.
matches = re.findall(pattern, text)
print(f"Liczba wystąpień: {len(matches)}")  # 3

# re.finditer(): an iterator of Match objects, one per occurrence, so the
# position of each match is available as well as its text.
for match in re.finditer(pattern, text):
    print(f"Pozycja: {match.start()}-{match.end()}, Tekst: {match.group()}")

# re.sub(): replace every occurrence.
new_text = re.sub(pattern, "JavaScript", text)
print(new_text)  # JavaScript jest świetny! JavaScript jest potężny! JavaScript jest prosty!

# re.split(): split the text wherever the pattern matches. The text ends
# with '!', so the last element of the result is an empty string.
parts = re.split(r"!", text)
print(parts)  # ['Python jest świetny', ' Python jest potężny', ' Python jest prosty', '']

# re.match(): succeeds only if the pattern matches at the very beginning.
if re.match(pattern, text) is not None:
    print("Tekst zaczyna się od 'Python'")

Flagi (flags):

import re

text = "Python PYTHON python"

# re.IGNORECASE (re.I): letters match regardless of case.
pattern = r"python"
matches = re.findall(pattern, text, re.IGNORECASE)
print(matches)  # ['Python', 'PYTHON', 'python']

# re.MULTILINE (re.M): '^' and '$' match at the start and end of every
# line, not only at the start and end of the whole text.
text = """Pierwsza linia
Druga linia
Trzecia linia"""

pattern = r"^\w+"  # the word at the start of each line
matches = re.findall(pattern, text, re.MULTILINE)
print(matches)  # ['Pierwsza', 'Druga', 'Trzecia']

# re.DOTALL (re.S): '.' also matches newline characters.
text = "Linia 1\nLinia 2\nLinia 3"
pattern = r".*"  # everything
matches = re.findall(pattern, text, re.DOTALL)
# The trailing '' is the empty match '.*' still finds at the end of the text.
print(matches)  # ['Linia 1\nLinia 2\nLinia 3', '']

# Combining flags with '|'. The pattern is lowercase and anchored, so it
# needs both flags to match: IGNORECASE for the capital 'L', and MULTILINE
# so that '^' and '$' apply to each line of the text.
pattern = r"^linia \d$"
matches = re.findall(pattern, text, re.IGNORECASE | re.MULTILINE)
print(matches)  # ['Linia 1', 'Linia 2', 'Linia 3']

Praktyczne przykłady:

import re

# Email validation
def is_valid_email(email):
    """Return True if *email* has the basic ``local@domain.tld`` shape.

    The check is purely syntactic: a local part made of letters, digits and
    ``._%+-``, an ``@``, a domain made of letters, digits, dots and hyphens,
    then a dot and a final part of at least two letters.

    Args:
        email: The string to check.

    Returns:
        True when the whole string matches the pattern, False otherwise.
    """
    pattern = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
    # fullmatch() requires the pattern to cover the entire string. With
    # re.match() and a trailing '$', a string ending in a newline
    # ("jan@example.com\n") would be accepted, because '$' also matches
    # just before a final newline.
    return re.fullmatch(pattern, email) is not None

# Sample inputs for the validator, checked one by one below.
emails = [
    "jan@example.com",
    "invalid-email",
    "test@domain.co.uk",
    "user@.com",
]

# Print each sample next to the validator's verdict.
for email in emails:
    print(f"{email}: {is_valid_email(email)}")

# Extracting dates written in three different layouts. The pattern only
# checks the digit counts and separators, not whether the date is valid.
text = "Spotkanie 15.03.2024, Deadline: 2024-12-31, Data: 2024/06/15"
date_pattern = (
    r"\d{2}\.\d{2}\.\d{4}"   # 2 digits . 2 digits . 4 digits (e.g. 15.03.2024)
    r"|\d{4}-\d{2}-\d{2}"    # 4 digits - 2 digits - 2 digits (e.g. 2024-12-31)
    r"|\d{4}/\d{2}/\d{2}"    # 4 digits / 2 digits / 2 digits (e.g. 2024/06/15)
)
dates = re.findall(date_pattern, text)
print(f"Znalezione daty: {dates}")

# Extracting phone numbers: an optional "+NN " prefix, an optional "(NN) "
# prefix, then three groups of three digits optionally separated by a
# hyphen or a space. The groups are non-capturing, so findall() returns
# whole matches. The fax number in this sample is grouped 3-2-2 and is
# therefore not matched by this pattern.
text = "Tel: 123-456-789, Mobile: +48 987 654 321, Fax: (22) 123-45-67"
phone_pattern = r"(?:\+\d{2} )?(?:\(\d{2}\) )?\d{3}[\- ]?\d{3}[\- ]?\d{3}"
phones = re.findall(phone_pattern, text)
print(f"Numery telefonów: {phones}")

# Cleaning up text: collapse every run of whitespace into a single space,
# then trim what is left at both ends.
text = "   Python   jest   świetny!   "
collapsed = re.sub(r"\s+", " ", text)
cleaned = collapsed.strip()
print(f"Przed: '{text}'")
print(f"Po: '{cleaned}'")

# Extracting quoted text: the group captures everything between a pair of
# double quotes, so findall() returns the contents without the quotes.
text = 'Powiedział "Hello world" i "Python jest super"'
quote_pattern = r'"([^"]*)"'
quotes = re.findall(quote_pattern, text)
print(f"Teksty w cudzysłowach: {quotes}")

Dobre praktyki:

Używaj surowych stringów (r"...") dla wzorców
Eskejpuj znaki specjalne, które mają być dopasowane dosłownie (np. \., \$, \+); \d, \w i \s to klasy znaków, a nie znaki eskejpowane
Używaj grup do wyodrębniania części dopasowania
Testuj wzorce na różnych danych
Używaj flag dla lepszej kontroli
Unikaj zbyt skomplikowanych wzorców
Dokumentuj wzorce regex

Ćwiczenie:

Stwórz funkcję walidującą różne formaty danych (email, telefon, kod pocztowy, PESEL).

Uwaga:

Wyrażenia regularne mogą być skomplikowane i niektóre wzorce mogą nie działać poprawnie w edytorze online. Zalecane jest testowanie regex w lokalnym środowisku Python.