dorbodwolf dorbodwolf - 4 months ago 19
Python Question

python regex: extract special substring in a hyperlink

I used Python to grab a series of hyperlinks, and I want to extract specific substrings from each of them.
The hyperlinks look like this:
http://tianqi.2345.com/hongkong/61063.htm

It contains a city name (hongkong) and a city ID (61063). I want to get the following result:

cityName=hongkong
cityID = 61063


my sample code is below:

import re
# Sample URL to test the named-group pattern against.
reNamedGroupTestStr = 'http://tianqi.2345.com/qinxian/61063.htm'
# NOTE(review): the pattern literal on the next line opens with a single quote
# but ends with a double quote, so the string is never terminated.
foundTagA = re.search('http://tianqi.2345.com/(?P<CityName>.+?)/(?P<CityID>.+?).htm", reNamedGroupTestStr);
# NOTE(review): the lines after the `if` are not indented in this listing;
# presumably they form its body -- confirm against the original post.
if(foundTagA):
GroupCityName = foundTagA.group("CityName");
print "CityName=",GroupCityName; # goal: print the city name, e.g. 'hongkong' (the test URL above contains 'qinxian')
GroupCityID = foundTagA.group("CityID");
print "CityID=",GroupCityID; # goal: print the city ID, '61063'


But the code throws an error. I am not familiar with regex; can anyone help me?

below is my full code:

# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import re

# Parse the locally saved page "countyID.html" with the lxml parser.
soup = BeautifulSoup(open("countyID.html"), "lxml")
#print(soup.prettify())
# Walk the div.bmeta elements with a manual counter and remember the one
# seen while the counter equals 5 in `countys`.
# NOTE(review): indentation is missing in this listing; presumably
# `i = i + 1` runs on every iteration rather than inside the `if` -- confirm.
i = 0
for tag in soup.select('div.bmeta'):
if i == 5:
countys = tag
i = i + 1


# For every <a> inside the remembered div, print its text and two pieces
# cut out of its href by fixed character offsets.
# NOTE(review): the offsets presumably assume hrefs shaped like
# 'http://tianqi.2345.com/<name>/<5-digit id>.htm' (a 23-character prefix and
# a 10-character '/<id>.htm' tail); any other shape would give wrong slices.
for county in countys.find_all('a'):
countyid = county.get('href')
print county.get_text() # print the city's Chinese name
print countyid[23:-10] # print the city name: characters after the first 23 and before the last 10
print countyid[-9:-4] # print the city ID: the 5 characters before the last 4
print '***'
#break
'''
the sample print result:
***
台北 #Print the city Chinese Name
taipei #print the cityName
71294 #print the cityID
***
'''

# Test of the corrected regex: named groups capture the city name (word
# characters) and the city ID (digits) from a sample URL.
# NOTE(review): the '.' characters in the pattern are unescaped, so each one
# matches any character rather than only a literal dot.
reNamedGroup = 'http://tianqi.2345.com/qinxian/61063.htm'
foundTagA = re.search('http://tianqi.2345.com/(?P<CityName>\w+?)/(?P<CityID>\d+?).htm', reNamedGroup)
# NOTE(review): the lines after the `if` are not indented in this listing;
# presumably they form its body -- confirm against the original post.
if(foundTagA):
GroupCityName = foundTagA.group("CityName");
print "CityName=",GroupCityName; # for the test URL above this group is 'qinxian' (the goal is the city name, e.g. 'hongkong')
GroupCityID = foundTagA.group("CityID");
print "CityID=",GroupCityID; # for the test URL above this group is '61063'

Answer

You can just split:

u = "http://tianqi.2345.com/hongkong/61063.htm"

# Split from the right at the last two slashes: the final two pieces are the
# city name and the "<id>.htm" file name; the leading piece is not needed.
_, nme, c_id = u.rsplit("/", 2)

# The city ID is whatever precedes the first dot of the file name.
print(nme, c_id.partition(".")[0])

Which will give you:

hongkong 61063

If you want to check that the URL starts with the host:

# Only take the URL apart when it belongs to the expected host.
if u.startswith("http://tianqi.2345.com/"):
    # Split from the right at the last two slashes: city name and file name.
    _, nme, c_id = u.rsplit("/", 2)
    # Drop the extension by cutting at the first dot of the file name.
    # str.rstrip(".htm") is not used here because it removes any trailing
    # '.', 'h', 't' or 'm' characters rather than the literal ".htm" suffix,
    # which can eat into a name that ends in one of those characters.
    c_id = c_id.split(".", 1)[0]

Since you are using BeautifulSoup, you can filter the anchor tags yourself using the id of the div that contains the links:

from bs4 import BeautifulSoup

import requests

# Download the live front page and parse it. The parser is named explicitly
# ("lxml", the same one the question's script uses) so the result does not
# depend on which parsers happen to be installed and no "no parser was
# explicitly specified" warning is raised.
soup = BeautifulSoup(requests.get("http://tianqi.2345.com/").content, "lxml")

# Visit only the anchors that carry an href inside the element with id "hot_l".
for a in soup.select("#hot_l a[href]"):
    # Split from the right at the last two slashes: city name and file name.
    _, nme, c_id = a["href"].rsplit("/", 2)
    # Print the city name and the part of the file name before its first dot.
    print(nme, c_id.split(".", 1)[0])

Which gives you:

beijing 54511
shanghai 58362
tianjin 54527
hangzhou 58457
guangzhou 59287
chengdu 56294
xian 57036
nanjing 58238
shenzhen 59493
chongqing 57516
anshan 54339
beidaihe 71098
dongguan 59289
changsha 57687
shenyang 54342
xiamen 59134
wuhan 57494
haikou 59758
wulumuqi 51463
qingdao 54857
foshan 59288
ganzhou 57993
hefei 58321
dalian 54662
haerbin 50953
changchun 54161
nanning 59431
guiyang 57816
fuzhou 58847
zhengzhou 57083
jinan 54823
guilin 57957
taiyuan 53772
nanchang 58606
lasa 55591
hengyang 57872
jiuzhaigou 60925
lian 58044
xining 52866
xianggang 45007
aomen 45011
taipei 71294
dali 56751
wuxi 58354
xuzhou 58027
yangzhou 58245
jiujiang 58502
luoyang 57073
nantong 58259
ningbo 58465
rizhao 54945