python標準庫之re模塊

本文以面試題的方式介紹re模塊的部分用法

groups()

# 1.字母和數字表示他們自身。一個正則表達式模式中的字母和數字匹配同樣的字符串。
# Bind the match object to `a`; the calls below inspect it.
a = re.match('www', 'www.runoob.com')
a.group() # 'www'
a.group(0) # 'www' 等價於a.group()
a.groups() # ()
a.group(1) # IndexError: no such group

# 2.用來表示一組字符,單獨列出：[amk] 匹配 'a'，'m'或'k'
# Bind the match object to `a`; the calls below inspect it.
a = re.match('[www]', 'www.runoob.com')  # [www] is equivalent to [w]; it is not normally written this way
a.group() # 'w'
a.group(0) # 'w' 等價於a.group()
a.groups() # ()
a.group(1) # IndexError: no such group

# 3.(re)對正則表達式分組並記住匹配的文本
# Bind the match object to `a`; the calls below inspect it.
a = re.match('(www)', 'www.runoob.com')
a.group() # 'www'
a.group(0) # 'www' 等價於a.group()
a.groups() # ('www',)
a.group(1) # 'www'

# 4.分多組匹配
a = re.match( r'(\w*)\.(\w*)\.(\w*)', 'www.runoob.com')
a.group() # 'www.runoob.com'
a.group(0) # 'www.runoob.com' 等價於a.group()
a.groups() # ('www', 'runoob', 'com')
a.group(1) # 'www'
a.group(2) # 'runoob'
a.group(3) # 'com'
a.group(4) # IndexError: no such group

# 5. Greedy mode (here the group captures everything before the LAST '.')
a = re.match( r'(.*)\..*', 'Www.runoob.com', re.I)
a.group() # 'Www.runoob.com'  (re.I only affects matching; the returned text keeps its original case)
a.groups() # ('Www.runoob',)

# 6. Non-greedy mode *? (here the group captures everything before the FIRST '.')
a = re.match( r'(.*?)\..*', 'Www.runoob.com', re.I)
a.group() # 'Www.runoob.com'  (re.I only affects matching; the returned text keeps its original case)
a.groups() # ('Www',)

# 7. Named groups: '(?P<name>...)' captures like a normal group and also
#    makes the captured text available under `name`.
s = '1102231990xxxxxxxx'
# Raw string so that `\d` reaches the regex engine instead of being parsed
# as an (invalid) string escape sequence.
a = re.search(r'(?P<province>\d{3})(?P<city>\d{3})(?P<born_year>\d{4})', s)
a.group() # '1102231990'
a.groups() # ('110', '223', '1990')
a.groupdict() # {'province': '110', 'city': '223', 'born_year': '1990'}

正則表達式匹配中，（.*）和（.*?）匹配區別？

（.*）是貪婪匹配，會把滿足正則的儘可能多的往後匹配
（.*?）是非貪婪匹配，會把滿足正則的儘可能少匹配

re.split

s=“info:xiaoZhang 33 shandong”,用正則切分字符串輸出[‘info’, ‘xiaoZhang’, ‘33’, ‘shandong’]

import re
s="info:xiaoZhang 33 shandong"
# |表示或，根據冒號或者空格切分
re.split(r':| ',s)  # ['info', 'xiaoZhang', '33', 'shandong']

re.search

<div class="nam">中國</div>，用正則匹配出標籤裏面的內容（“中國”），其中class的類名是不確定的

 s='<div class="nam">中國</div>'
 a=re.search(r'<div class=".*">(.*)</div>',s)
 a.groups() # ('中國',)

re.sub

字符串a = “not 404 found 張三 99 深圳”，每個詞中間是空格，用正則過濾掉英文和數字，最終輸出"張三深圳"

a = "not 404 found 張三 99 深圳"
# Remove every run of ASCII letters, digits and whitespace. Whitespace is in
# the character class itself, so the space between the two Chinese words is
# removed too and the result matches the required output.
b = re.sub(r'[0-9a-zA-Z\s]+', '', a)
# '張三深圳'

a=“張明 98分”，用re.sub，將98替換爲100

>>> a="張明 98分"
>>> b=re.sub(r'\d+','100',a)
>>> b
'張明 100分'

替換的參數可以是一個函數

# 將匹配的數字乘以 2
s = 'A23G4HFD567'


def double1(matched):
    """Return twice the integer captured by group 1, as a string.

    Meant to be passed as the ``repl`` callable of ``re.sub``. The pattern
    must define a first capturing group whose text is a valid integer.
    """
    value = int(matched.group(1))
    return str(value * 2)


# Raw string so that `\d` reaches the regex engine instead of being parsed
# as an (invalid) string escape sequence.
print(re.sub(r'(\d+)', double1, s)) # A46G8HFD1134

# Alternative: read the digits through a '(?P<name>...)' named group.
def double2(matched):
    """Return twice the integer captured by the group named ``value``, as a string."""
    return str(2 * int(matched.group('value')))
# Patterns are raw strings so that `\d` reaches the regex engine instead of
# being parsed as an (invalid) string escape sequence.
print(re.sub(r'(?P<value>\d+)', double2, s)) # A46G8HFD1134

# The same doubling written inline as a lambda.
print(re.sub(r'(\d+)', lambda x: str(int(x.group(1)) * 2), s)) # A46G8HFD1134

# 提取數字：用數字替換查找的文本   如：找到'A23',組內容爲'23',用'23'替換'A23'；然後繼續下一次操作
>>> a=re.sub(r'[a-zA-Z]*(\d+)',lambda x:x.group(1),s)
>>> a
'234567'

re.findall

正則匹配，匹配日期2018-03-20
url=‘https://sycm.taobao.com/bda/tradinganaly/overview/get_summary.json?dateRange=2018-03-20%7C2018-03-20&dateType=recent1&device=1&token=ff25b109b&_=1521595613462’

>>> url='https://sycm.taobao.com/bda/tradinganaly/overview/get_summary.json?dateRange=2018-03-20%7C2018-03-20&dateType=recent1&device=1&token=ff25b109b&_=1521595613462'
>>> a=re.findall(r'\d{4}\-\d{2}\-\d{2}',url)
>>> a
['2018-03-20', '2018-03-20'] # findall返回的是list

re.compile

正則匹配中文

>>> title = '你好，hello，世界！'
>>> p = re.compile(r'[\u4e00-\u9fa5]+')  # 或re.compile(r'[一-龥]+')  這兩個unicode值正好是Unicode表中的漢字的頭和尾。
>>> p.findall(title)
['你好', '世界']

正則re.compile作用？
re.compile是將正則表達式編譯成一個對象，加快速度，並重複使用

\1…\9 匹配第n個分組的內容

正則表達式匹配出<html><h1>www.itcast.cn</h1></html>

>>> label='<html><h1>www.itcast.cn</h1></html>'
>>> a=re.match(r'<(\w*)><(\w*)>.*</\2></\1>',label)   # \2匹配第2組的內容  \1匹配第1組的內容
>>> a.group()
'<html><h1>www.itcast.cn</h1></html>'
>>> a.groups()
('html', 'h1')

>>> label='<html><h1>www.itcast.cn</h2></html>'
>>> a=re.match(r'<(\w*)><(\w*)>.*</\2></\1>',label)
>>> a.group()
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
AttributeError: 'NoneType' object has no attribute 'group'

其他正則排序面試題

A. 有字符串 A = ‘a33aa2a3aa5aa6aaa3aaa7aaa8aaa4’
求出所有含有一個或者多個 ‘a’ 子串加數字子串的子字符串(如：‘a33’, ‘aa2’等)，並排序，排序要求：
（1）含 ‘a’ 少的子字符串放在前，含 ‘a’ 多的子字符串放在後面；
（2）含 ‘a’ 子串數相同的再比較子串後面的數字大小，按從大到小排序。

>>> A =  'a33aa2a3aa5aa6aaa3aaa7aaa8aaa4'
>>> l = re.findall(r'a+\d+', A)
>>> sorted(l,key=lambda x:(x.count('a'), -int(x[x.count('a'):])))
['a33', 'a3', 'aa6', 'aa5', 'aa2', 'aaa8', 'aaa7', 'aaa4', 'aaa3']

sorted() 中key的排序規則可以有多個，比如：
1）x.count(‘a’) 以字母 a 的個數大小排序；
2）-int(x[x.count(‘a’):]) 以字母 a 後面數字的大小排序，加負號則按逆序排列；
3）默認從小到大排序；

B. 去除以下html文件中的標籤，只顯示文本信息

s = """<div>\
<p>崗位職責：</p>\
<p>完成推薦算法、數據統計、接口、後臺等服務器端相關工作</p>\
<p><br></p>\
<p>必備要求：</p>\
<p>良好的自我驅動力和職業素養，工作積極主動、結果導向</p>\
<p>&nbsp;<br></p>\
<p>技術要求：</p>\
<p>1、一年以上 Python 開發經驗，掌握面向對象分析和設計，瞭解設計模式</p>\
<p>2、掌握HTTP協議，熟悉MVC、MVVM等概念以及相關WEB開發框架</p>\
<p>3、掌握關係數據庫開發設計，掌握 SQL，熟練使用 MySQL/PostgreSQL 中的一種<br></p>\
<p>4、掌握NoSQL、MQ，熟練使用對應技術解決方案</p>\
<p>5、熟悉 Javascript/CSS/HTML5，JQuery、React、Vue.js</p>\
<p>&nbsp;<br></p>\
<p>加分項：</p>\
<p>大數據，數理統計，機器學習，sklearn，高性能，大併發。</p>\
</div> """

a=re.sub(r'</?\w+>','',s)

C. 將以下網址提取出域名

url = 'http://www.interoem.com/messageinfo.asp?id=35'
a=re.match(r'http[s]?://.+?/',url)
a.group() # http://www.interoem.com/

url = 'http://www.interoem.com/messageinfo.asp?id=35,https://3995503.com/class/class09/news_show.asp?id=14,http://lib.wzmc.edu.cn/news/onews.asp?id=769'
a=re.findall(r'(http[s]?://.+?/)', url) # ['http://www.interoem.com/', 'https://3995503.com/', 'http://lib.wzmc.edu.cn/']

D. 提取出如下字符串中的單詞

>>> s='hello world ha ha'
>>> s.split(' ')
['hello', 'world', 'ha', 'ha']

>>> a=re.findall(r'\b[a-zA-Z]+\b', s)
>>> a
['hello', 'world', 'ha', 'ha']

\b：匹配一個單詞邊界，也就是指單詞和空格間的位置。例如， ‘er\b’ 可以匹配 “never” 中的 ‘er’，但不能匹配 “verb” 中的 ‘er’。

python標準庫之re模塊