Merge pull request #97 from pointhi/https

Implementing https rewrite support
This commit is contained in:
Adam Tauber 2014-10-19 12:06:34 +02:00
commit 20400c40c3
40 changed files with 4691 additions and 13 deletions

View File

@ -17,6 +17,7 @@ along with searx. If not, see < http://www.gnu.org/licenses/ >.
from os import environ
from os.path import realpath, dirname, join, abspath
from searx.https_rewrite import load_https_rules
try:
from yaml import load
except:
@ -34,7 +35,16 @@ if 'SEARX_SETTINGS_PATH' in environ:
else:
settings_path = join(searx_dir, 'settings.yml')
# Directory holding the https rewrite rulesets: the value of the
# SEARX_HTTPS_REWRITE_PATH environment variable when it is set, otherwise
# the 'https_rules' directory below searx_dir.
https_rewrite_path = (environ['SEARX_HTTPS_REWRITE_PATH']
                      if 'SEARX_HTTPS_REWRITE_PATH' in environ
                      else join(searx_dir, 'https_rules'))

# Read the settings file selected above.
# NOTE(review): `load` is yaml's `load`; without an explicit safe loader it
# can construct arbitrary objects. Presumably acceptable for a trusted local
# settings file - confirm.
with open(settings_path) as settings_yaml:
    settings = load(settings_yaml)

# Parse the https rulesets only when the 'server' section of the settings
# enables 'https_rewrite'; a missing section or key counts as disabled.
if settings.get('server', {}).get('https_rewrite'):
    load_https_rules(https_rewrite_path)

View File

@ -1,14 +1,141 @@
'''
searx is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
searx is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with searx. If not, see < http://www.gnu.org/licenses/ >.
(C) 2013- by Adam Tauber, <asciimoo@gmail.com>
'''
import re
from lxml import etree
from os import listdir
from os.path import isfile, join
# https://gitweb.torproject.org/\
# pde/https-everywhere.git/tree/4.0:/src/chrome/content/rules
# HTTPS rewrite rules
# NOTE(review): this hard-coded tuple is rebound to an empty list directly
# below, so the two xkcd entries never remain in `https_rules`; it looks
# like a leftover from before rulesets were loaded from XML - confirm and
# remove.
https_rules = (
# from: compiled, case-insensitive pattern
(re.compile(r'^http://(www\.|m\.|)?xkcd\.(?:com|org)/', re.I | re.U),
# to: replacement template (\1 refers to the first group of the pattern)
r'https://\1xkcd.com/'),
(re.compile(r'^https?://(?:ssl)?imgs\.xkcd\.com/', re.I | re.U),
r'https://sslimgs.xkcd.com/'),
)
# Module-level list of loaded rulesets; load_https_rules() appends one
# (target_hosts, rules, exclusions) tuple per accepted XML file.
https_rules = []
# load single ruleset from a xml file
def load_single_https_ruleset(filepath):
    """Load one HTTPS Everywhere ruleset from an XML file.

    The root element must be ``<ruleset>``.  Rulesets carrying a non-empty
    ``default_off`` or ``platform`` attribute are ignored.

    Args:
        filepath: path of the XML ruleset file.

    Returns:
        An empty tuple when the file cannot be read or parsed, is not an
        active ruleset, declares no usable ``<target host=...>``, or the
        combined host pattern does not compile.  Otherwise a 3-tuple
        ``(target_hosts, rules, exclusions)``:

        * ``target_hosts`` - compiled, case-insensitive regex that matches
          any declared host at the start of a string (``*`` in a host is a
          wildcard);
        * ``rules`` - list of ``(from_pattern, to_template)`` string pairs.
          ``from_pattern`` is returned uncompiled and unmodified;
          ``to_template`` has its JavaScript ``$N`` group references
          converted to Python backslash-N references;
        * ``exclusions`` - list of compiled exclusion regexes.

    Raises:
        re.error: if an ``<exclusion>`` pattern is not a valid Python regex.
    """
    # load and parse xml-file
    try:
        tree = etree.parse(filepath, etree.XMLParser())
    except (etree.ParseError, IOError, OSError):
        # unreadable or malformed file: treat it as "no ruleset"
        # TODO, error message
        return ()

    root = tree.getroot()

    # only <ruleset> documents are accepted
    # TODO improve parsing
    if root.tag != 'ruleset':
        return ()

    # skip rules which are deactivated by default
    if root.attrib.get('default_off'):
        return ()

    # skip rules which only work on specific platforms
    if root.attrib.get('platform'):
        return ()

    hosts = []
    rules = []
    exclusions = []

    # walk the children of the ruleset; unknown tags are ignored
    for child in root:
        if child.tag == 'target':
            host = child.attrib.get('host')
            if not host:
                continue

            # convert host-rule to valid regex: escape dots, expand wildcards
            hosts.append(host.replace('.', r'\.').replace('*', '.*'))

        elif child.tag == 'rule':
            rule_from = child.attrib.get('from')
            rule_to = child.attrib.get('to')
            if not rule_from or not rule_to:
                continue

            # The rules use JavaScript regex syntax.  In 'from' a '$' is an
            # ordinary end anchor and must stay untouched (rewriting it
            # produced patterns ending in a lone backslash, which do not
            # compile).  Only '$N' in 'to' is a group reference, and Python
            # spells that '\N'.
            rule_to = re.sub(r'\$(\d)', r'\\\1', rule_to)

            # 'from' is kept as a string; the caller compiles it
            rules.append((rule_from, rule_to))

        elif child.tag == 'exclusion':
            pattern = child.attrib.get('pattern')
            if not pattern:
                continue

            exclusions.append(re.compile(pattern))

    # without any host the alternation below would be '^()', which matches
    # every string - such a ruleset must never be applied
    if not hosts:
        return ()

    # convert list of possible hosts to a simple regex
    # TODO compress regex to improve performance
    try:
        target_hosts = re.compile('^(' + '|'.join(hosts) + ')', re.I | re.U)
    except re.error:
        return ()

    return (target_hosts, rules, exclusions)
# load all https rewrite rules
def load_https_rules(rules_path):
    """Load every ``*.xml`` ruleset stored directly in *rules_path*.

    Each file is handed to ``load_single_https_ruleset``; every non-empty
    result is appended to the module-level ``https_rules`` list.  When all
    files are processed, the total number of entries in ``https_rules`` is
    printed.

    Files are processed in sorted filename order.  ``listdir`` returns
    entries in arbitrary order, while the bundled rulesets rely on filename
    order to decide which ruleset takes precedence, so sorting keeps the
    resulting rule order deterministic.

    Args:
        rules_path: directory containing the XML ruleset files.

    Raises:
        OSError: if *rules_path* cannot be listed.
    """
    # search all xml files which are stored in the https rule directory
    # (join() inserts the path separator itself when it is missing)
    xml_files = [join(rules_path, name)
                 for name in sorted(listdir(rules_path))
                 if name.endswith('.xml') and isfile(join(rules_path, name))]

    for ruleset_file in xml_files:
        ruleset = load_single_https_ruleset(ruleset_file)

        # skipped, disabled and unparsable files yield an empty tuple
        if not ruleset:
            continue

        https_rules.append(ruleset)

    print(' * {n} https-rules loaded'.format(n=len(https_rules)))

View File

@ -0,0 +1,17 @@
<!--
This directory contains web site rewriting rules for the
HTTPS Everywhere software, available from
https://www.eff.org/https-everywhere
These rules were contributed to the project by users and aim to
enable routine secure access to as many different web sites as
possible. They are automatically installed together with the
HTTPS Everywhere software. The presence of these rules does not
mean that an HTTPS Everywhere user accessed, or intended to
access, any particular web site.
For information about how to create additional HTTPS Everywhere
rewriting rules to add support for new sites, please see
https://www.eff.org/https-everywhere/rulesets
-->

View File

@ -0,0 +1,56 @@
<!--
For other Microsoft coverage, see Microsoft.xml.
CDN buckets:
- a134.lm.akamai.net
- akam.bing.com
- *.mm.bing.net
Nonfunctional domains:
- m2.cn.bing.com
- origin.bj1.bing.com
- blogs.bing.com
Fully covered domains:
- bing.com subdomains:
- (www.)
- c.bing (tracking beacons)
- cn.bing
- h.bing
- ssl
- testfamilysafety.bing
- udc.bing
- (www.)bing
- *.mm.bing.net
- api.bing.com
-->
<ruleset name="Bing">
<target host="bing.com" />
<target host="*.bing.com" />
<target host="*.mm.bing.net" />
<securecookie host=".*\.bing\.com$" name=".+" />
<rule from="^http://((?:c|cn|h|ssl|testfamilysafety|udc|www)\.)?bing\.com/"
to="https://$1bing.com/" />
<rule from="^http://([^/:@]*)\.mm\.bing\.net/"
to="https://$1.mm.bing.com/"/>
<rule from="^http://([^/:@]*)\.api\.bing\.net/"
to="https://$1.api.bing.com/"/>
</ruleset>

View File

@ -0,0 +1,69 @@
<!--
Nonfunctional domains:
- blog.dailymotion.com
- press.dailymotion.com (shows steaw.com, CN: www.steaw.com)
- proxy-46.dailymotion.com
- publicite.dailymotion.com
- publisher.dailymotion.com (reset)
- vid.ak.dmcdn.net (403, Akamai)
- vid2.ak.dmcdn.net (504, akamai)
Problematic domains:
- ak2.static.dailymotion.com (mismatched, CN: *.dmcdn.net)
- support.dmcloud.net (mismatched, CN: *.zendesk.com)
Partially covered domains:
- (www.)dailymotion.com
- cdn/manifest/video/\w+.mnft 403s
- crossdomain.xml breaks videos
-->
<ruleset name="Dailymotion (default off)" default_off="breaks some embedded videos">
<target host="dailymotion.com" />
<!--
* for cross-domain cookie.
-->
<target host="*.dailymotion.com" />
<!--
https://mail1.eff.org/pipermail/https-everywhere-rules/2012-July/001241.html
-->
<exclusion pattern="^http://(?:www\.)?dailymotion\.com/(?:cdn/[\w-]+/video/|crossdomain\.xml$)" />
<target host="ak2.static.dailymotion.com" />
<target host="*.dmcdn.net" />
<target host="dmcloud.net" />
<target host="*.dmcloud.net" />
<!-- Testing wrt embedded breakage.
securecookie host="^.*\.dailymotion\.com$" name=".+" /-->
<!--
Omniture tracking cookies:
-->
<securecookie host="^\.dailymotion\.com$" name="^s_\w+$" />
<securecookie host="^www\.dailymotion\.com$" name=".+" />
<rule from="^http://(erroracct\.|www\.)?dailymotion\.com/"
to="https://$1dailymotion.com/" />
<rule from="^http://(s\d|static(?:\d|s\d-ssl))\.dmcdn\.net/"
to="https://$1.dmcdn.net/" />
<rule from="^https?://ak2\.static\.dailymotion\.com/"
to="https://static1-ssl.dmcdn.net/" />
<rule from="^http://(s\.|www\.)?dmcloud\.net/"
to="https://$1dmcloud.net/" />
<rule from="^https?://support\.dmcloud\.net/"
to="https://dmcloud.zendesk.com/" />
</ruleset>

View File

@ -0,0 +1,53 @@
<!--
For problematic rules, see Deviantart-mismatches.xml.
Other deviantArt rulesets:
- Sta.sh.xml
ToDo: Find edgecast URL for /(fc|th)\d+.
Mixed content:
- Images on *.....com from e.deviantart.net *
* Secured by us
-->
<ruleset name="DeviantArt (pending)" default_off="site operator says not ready yet">
<target host="deviantart.com" />
<target host="*.deviantart.com" />
<target host="deviantart.net" />
<target host="*.deviantart.net" />
<!-- Not secured by server:
-->
<!--securecookie host="^\.deviantart\.com$" name="^userinfo$" /-->
<securecookie host="^\.deviantart\.com$" name=".*" />
<!-- Redirects from com to net, but does so successfully by itself.
-->
<rule from="^http://([aei]|fc\d\d|s[ht]|th\d\d)\.deviantart\.(com|net)/"
to="https://$1.deviantart.$2/" />
<!-- This handles everything that isn't in the first rule.
Namely, usernames, backend, fc, th, and (www.).
These domains present a cert that is only
valid for .com.
Note that .net isn't used on DA, but .net does
redirect to .com, and we shouldn't break what would
otherwise work.
Mustn't rewrite from https here, as doing so
would conflict with the first rule.
-->
<rule from="^http://([^/:@\.]+\.)?deviantart\.(?:com|net)/"
to="https://$1deviantart.com/" />
</ruleset>

View File

@ -0,0 +1,38 @@
<!--
Problematic domains:
- www.dukgo.com (mismatched, CN: dukgo.com)
Fully covered domains:
- (www.)dukgo.com (www → ^)
-->
<ruleset name="DuckDuckGo">
<target host="duckduckgo.com" />
<target host="*.duckduckgo.com" />
<target host="ddg.gg" />
<target host="duck.co" />
<target host="i.duck.co" />
<target host="dukgo.com" />
<target host="www.dukgo.com" />
<exclusion pattern="^http://(help|meme)\.duckduckgo\.com/" />
<securecookie host="^duck\.co$" name=".*"/>
<rule from="^http://duckduckgo\.com/" to="https://duckduckgo.com/"/>
<rule from="^http://([^/:@\.]+)\.duckduckgo\.com/" to="https://$1.duckduckgo.com/"/>
<!-- TODO: What does ddg.gg/foo do? Runs query foo, redirects to homepage, or error? -->
<rule from="^http://ddg\.gg/$" to="https://duckduckgo.com/" />
<rule from="^http://duck\.co/" to="https://duck.co/" />
<rule from="^http://i\.duck\.co/"
to="https://duckduckgo.com/"/>
<rule from="^http://(?:www\.)?dukgo\.com/"
to="https://dukgo.com/" />
</ruleset>

View File

@ -0,0 +1,44 @@
<!--
For other Yahoo coverage, see Yahoo.xml.
These altnames don't exist:
- www.blog.flickr.net
- www.code.flickr.net
-->
<ruleset name="Flickr">
<target host="flic.kr" />
<target host="*.flic.kr" />
<target host="flickr.com" />
<target host="*.flickr.com" />
<target host="*.flickr.net" />
<target host="*.staticflickr.com" />
<!-- Not secured by server:
-->
<!--securecookie host="^\.flic\.kr$" name="^BX$" /-->
<securecookie host="^\.flic\.kr$" name=".+" />
<securecookie host=".*\.flickr\.com$" name=".+" />
<rule from="^http://flic\.kr/"
to="https://flic.kr/" />
<rule from="^http://(api\.|www\.)?flickr\.com/"
to="https://$1flickr.com/" />
<rule from="^http://s(ecure|tatic)\.flickr\.com/"
to="https://s$1.flickr.com/" />
<rule from="^http://(c2|farm\d+)\.static(\.)?flickr\.com/"
to="https://$1.static$2flickr.com/" />
<rule from="^http://(blog|code)\.flickr\.net/"
to="https://$1.flickr.net/" />
</ruleset>

View File

@ -0,0 +1,11 @@
<!--
For other GitHub coverage, see Github.xml.
-->
<ruleset name="GitHub Pages">
<target host="*.github.io" />
<rule from="^http://([^/@:\.]+)\.github\.io/"
to="https://$1.github.io/" />
</ruleset>

View File

@ -0,0 +1,94 @@
<!--
Other GitHub rulesets:
- Github-Pages.xml
- Guag.es.xml
- Speaker_Deck.com.xml
CDN buckets:
- github-images.s3.amazonaws.com
- github.global.ssl.fastly.net
- a248.e.akamai.net/assets.github.com/
- a248.e.akamai.net/camo.github.com/
- s3.amazonaws.com/github/ | d24z2fz21y4fag.cloudfront.net
- github.myshopify.com
Fully covered domains:
- github.com subdomains:
- (www.)
- assets\d+
- assets-cdn
- bounty
- cloud
- f.cloud
- codeload
- developer
- eclipse
- enterprise
- gist
- gist-assets
- help
- identicons
- jobs
- mac
- mobile
- nodeload
- octodex
- pages
- raw
- rg3
- shop
- status
- support
- training
- try
- wiki
- windows
- collector.githubapp.com
- githubusercontent.com
-->
<ruleset name="GitHub">
<target host="github.com" />
<target host="*.github.com" />
<target host="github.io" />
<target host="*.githubusercontent.com" />
<target host="collector.githubapp.com" />
<!-- Secured by server:
-->
<!--securecookie host="^github\.com$" name="^(_gh_sess|tz|user_session)$" /-->
<!--securecookie host="^\.github\.com$" name="^(dotcom_user|logged_in)$" /-->
<!--securecookie host="^enterprise\.github\.com$" name="^(_enterprise_web|request_method)$" /-->
<!--securecookie host="^gist\.github\.com$" name="^_gist_session$" /-->
<!--securecookie host="^help\.github\.com$" name="^_help_session$" /-->
<!--
Not secured by server:
-->
<!--securecookie host="^status\.github\.com$" name="^rack\.session$" /-->
<securecookie host="^(?:.*\.)?github\.com$" name=".+" />
<rule from="^http://((?:assets\d+|assets-cdn|bounty|cloud|f\.cloud|codeload|developer|eclipse|enterprise|gist|gist-assets|help|identicons|jobs|mac|mobile|nodeload|octodex|pages|raw|rg3|shop|status|support|training|try|wiki|windows|www)\.)?github\.com/"
to="https://$1github.com/" />
<rule from="^http://collector\.githubapp\.com/"
to="https://collector.githubapp.com/" />
<rule from="^https?://github\.io/"
to="https://pages.github.com/" />
<rule from="^http://([^/@:\.]+)\.githubusercontent\.com/"
to="https://$1.githubusercontent.com/" />
</ruleset>

View File

@ -0,0 +1,26 @@
<!--
Problematic domains:
- (www.)apture.com (works, mismatched, CN: *.google.com)
-->
<ruleset name="Google (mismatches)" default_off="mismatches">
<!-- Akamai -->
<target host="js.admeld.com"/>
<target host="apture.com" />
<target host="www.apture.com" />
<target host="googleartproject.com"/>
<target host="www.googleartproject.com"/>
<rule from="^http://js\.admeld\.com/"
to="https://js.admeld.com/"/>
<rule from="^https?://(?:www\.)?apture\.com/"
to="https://apture.com/" />
<rule from="^http://(?:www\.)?googleartproject\.com/"
to="https://www.googleartproject.com/"/>
</ruleset>

View File

@ -0,0 +1,14 @@
<!--
For other Google coverage, see GoogleServices.xml.
-->
<ruleset name="Google.org">
<target host="google.org" />
<target host="www.google.org" />
<rule from="^http://(www\.)?google\.org/"
to="https://$1google.org/" />
</ruleset>

View File

@ -0,0 +1,143 @@
<!--
For other Google coverage, see GoogleServices.xml.
Nonfunctional domains:
- hosted.gmodules.com *
- img0.gmodules.com *
- p.gmodules.com *
* 404; mismatched, CN: *.googleusercontent.com
Problematic domains:
- gmodules.com (503, CN: www.google.com)
- www.gmodules.com (503, CN: *.googleusercontent.com)
- gstatic.com (404, valid cert)
- api.recaptcha.net (works; mismatched, CN: google.com)
Partially covered domains:
- (www.)gmodules.com (→ www.google.com)
- (www.)google.com
- chart.apis.google.com (→ chart.googleapis.com)
Fully covered domains:
- api.google.com
- *.clients.google.com:
- linkhelp
- ssl.google-analytics.com
- www.google-analytics.com
- googleapis.com subdomains:
- ajax
- chart
- *.commondatastorage
- fonts
- *.storage
- www
- gstatic.com subdomains:
- (www.) (^ → www)
- csi
- encrypted-tbn\d
- g0
- *.metric
- ssl
- t\d
- api.recaptcha.net (→ www.google.com)
- api-secure.recaptcha.net
- gdata.youtube.com
ssl.google-analytics.com/ga.js sets __utm\w wildcard
cookies on whichever domain it is loaded from.
-->
<ruleset name="Google APIs">
<target host="gmodules.com" />
<target host="www.gmodules.com" />
<target host="google.com" />
<target host="apis.google.com" />
<target host="*.apis.google.com" />
<target host="*.clients.google.com" />
<target host="www.google.com" />
<target host="*.google-analytics.com" />
<target host="*.googleapis.com" />
<target host="gstatic.com" />
<target host="*.gstatic.com" />
<!-- Captive portal detection redirects to this URL, and many captive
portals break TLS, so exempt this redirect URL.
See GitHub bug #368
-->
<exclusion pattern="^http://www\.gstatic\.com/generate_204" />
<target host="*.recaptcha.net" />
<target host="gdata.youtube.com" />
<exclusion pattern="^http://gdata\.youtube\.com/crossdomain\.xml" />
<securecookie host="^ssl\.google-analytics\.com$" name=".+" />
<rule from="^http://(?:www\.)?gmodules\.com/ig/images/"
to="https://www.google.com/ig/images/" />
<!-- jsapi was causing problems on some sites that embed google maps:
https://trac.torproject.org/projects/tor/ticket/2335
Apparently now fixed; thanks, Google!
-->
<rule from="^http://(?:www\.)?google\.com/(afsonline/|chart|jsapi|recaptcha/|uds)"
to="https://www.google.com/$1" />
<rule from="^http://(api|[\w-]+\.client)s\.google\.com/"
to="https://$1s.google.com/" />
<rule from="^http://chart\.apis\.google\.com/chart"
to="https://chart.googleapis.com/chart" />
<rule from="^http://(ssl|www)\.google-analytics\.com/"
to="https://$1.google-analytics.com/" />
<rule from="^http://(ajax|chart|fonts|www)\.googleapis\.com/"
to="https://$1.googleapis.com/" />
<rule from="^http://([^@:\./]+\.)?(commondata)?storage\.googleapis\.com/"
to="https://$1$2storage.googleapis.com/" />
<!-- There is an interesting question about whether we should
append &strip=1 to all cache URLs. This causes them to load
without images and styles, which is more secure but can look
worse.
Without &strip=1, the images and styles from the cached
pages still load from the original, typically unencrypted, page.
With &strip=1, the cached page will be text-only and
will come exclusively from Google's HTTPS server.
-->
<rule from="^http://(?:www\.)?gstatic\.com/"
to="https://www.gstatic.com/" />
<rule from="^http://(csi|encrypted-tbn\d|g0|[\w-]+\.metric|ssl|t\d)\.gstatic\.com/"
to="https://$1.gstatic.com/" />
<rule from="^http://api\.recaptcha\.net/"
to="https://www.google.com/recaptcha/api/" />
<rule from="^http://api-secure\.recaptcha\.net/"
to="https://api-secure.recaptcha.net/" />
<rule from="^http://gdata\.youtube\.com/"
to="https://gdata.youtube.com/" />
</ruleset>

View File

@ -0,0 +1,6 @@
<ruleset name="GoogleCanada">
<target host="google.ca" />
<target host="*.google.ca" />
<rule from="^http://([^/:@\.]+)\.google\.ca/finance" to="https://$1.google.ca/finance"/>
</ruleset>

View File

@ -0,0 +1,65 @@
<!--
For other Google coverage, see GoogleServices.xml.
Problematic domains:
- www.google.bo *
- www.google.co *
- www.google.ec *
- www.google.in *
- www.google.kr *
- www.google.com.kz **
- www.google.com.lk *
- www.google.mx **
- www.google.sg *
- www.google.sl *
- www.google.ug *
- www.google.vn *
* 404; mismatched, CN: google.com
** Works; mismatched, CN: google.com
-->
<ruleset name="Google Images">
<target host="google.*" />
<target host="www.google.*" />
<target host="google.co.*" />
<target host="www.google.co.*" />
<target host="google.com" />
<target host="images.google.com" />
<target host="google.com.*" />
<target host="www.google.com.*" />
<!--
Only handle image-related paths in this ruleset:
-->
<exclusion pattern="^http://(?:www\.)?google(?:\.com?)?\.\w{2,3}/(?!(?:advanced_image_search|imghp|.*tb(?:m=isch|s=sbi)))" />
<rule from="^http://(?:www\.)?google\.com/"
to="https://www.google.com/" />
<rule from="^http://images\.google\.com/"
to="https://images.google.com/" />
<!-- First handle problematic domains:
-->
<rule from="^http://(?:www\.)?google\.co/"
to="https://www.google.com/" />
<rule from="^http://(?:www\.)?google\.(?:co\.)?(in|kr|ug)/"
to="https://www.google.co.$1/" />
<rule from="^http://(?:www\.)?google\.(?:com\.)?(kz|lk)/"
to="https://www.google.$1/" />
<rule from="^http://(?:www\.)?google\.(?:com\.)?(bo|ec|mx|sg|sl|vn)/"
to="https://www.google.com.$1/" />
<!-- And then the rest:
-->
<rule from="^http://(?:www\.)?google\.(com?\.)?(ae|ar|at|au|bg|bh|br|ca|ch|cl|co|cr|cu|de|eg|es|fi|fr|gh|gt|hr|id|ie|il|it|jo|jp|jm|ke|kw|lb|ly|my|na|ng|nl|no|nz|om|pa|pe|pk|pl|pt|py|qa|ro|ru|rw|sa|se|sv|th|tr|uk|uy|ve|za|zw)/"
to="https://www.google.$1$2/" />
</ruleset>

View File

@ -0,0 +1,78 @@
<ruleset name="Search www.google.com">
<!--
Enabling this ruleset should cause searches to go to
https://www.google.com rather than https://encrypted.google.com. Note that
the filename is important; it must be before GoogleSearch.xml in a bash
expansion of src/chrome/content/rules/*.xml in order to take precedence.
-->
<target host="*.google.com" />
<target host="google.com" />
<target host="www.google.com.*" />
<target host="google.com.*" />
<target host="www.google.co.*" />
<target host="google.co.*" />
<target host="www.google.*" />
<target host="google.*" />
<!-- beyond clients1 these do not currently exist in the ccTLDs,
but just in case... -->
<target host="clients1.google.com.*" />
<target host="clients2.google.com.*" />
<target host="clients3.google.com.*" />
<target host="clients4.google.com.*" />
<target host="clients5.google.com.*" />
<target host="clients6.google.com.*" />
<target host="clients1.google.co.*" />
<target host="clients2.google.co.*" />
<target host="clients3.google.co.*" />
<target host="clients4.google.co.*" />
<target host="clients5.google.co.*" />
<target host="clients6.google.co.*" />
<target host="clients1.google.*" />
<target host="clients2.google.*" />
<target host="clients3.google.*" />
<target host="clients4.google.*" />
<target host="clients5.google.*" />
<target host="clients6.google.*" />
<rule from="^http://www\.google\.com/$"
to="https://www.google.com/"/>
<!-- The most basic case. -->
<rule from="^http://(?:www\.)?google\.com/search"
to="https://www.google.com/search"/>
<!-- A very annoying exception that we seem to need for the basic case -->
<exclusion pattern="^http://(?:www\.)?google\.com/search.*tbs=shop" />
<exclusion pattern="^http://clients[0-9]\.google\.com/.*client=products.*" />
<exclusion pattern="^http://suggestqueries\.google\.com/.*client=.*" />
<!-- https://trac.torproject.org/projects/tor/ticket/9713 -->
<exclusion pattern="^http://clients[0-9]\.google\.com/ocsp" />
<!-- This is necessary for image results links from web search results -->
<exclusion pattern="^http://(?:www\.)?google\.com/search.*tbm=isch.*" />
<rule from="^http://(?:www\.)?google\.com/webhp"
to="https://www.google.com/webhp"/>
<rule from="^http://(?:www\.)?google\.com/#"
to="https://www.google.com/#"/>
<rule from="^http://(?:www\.)?google\.com/$"
to="https://www.google.com/"/>
<!-- Completion urls look like this:
http://clients2.google.co.jp/complete/search?hl=ja&client=hp&expIds=17259,24660,24729,24745&q=m&cp=1 HTTP/1.1\r\n
-->
<rule from="^http://clients[0-9]\.google\.com/complete/search"
to="https://clients1.google.com/complete/search"/>
</ruleset>

View File

@ -0,0 +1,67 @@
<!--
Problematic domains:
- khms *
- khms[0-3] *
* $ 404s
Fully covered domains:
- google.com subdomains:
- khms
- khms[0-3]
-->
<ruleset name="Google Maps">
<target host="maps.google.*" />
<!--
https://trac.torproject.org/projects/tor/ticket/8627
-->
<exclusion pattern="^http://maps\.google\.com/local_url" />
<exclusion pattern="^http://maps\.google\.gr/transitathens" />
<target host="maps.google.co.*" />
<target host="khms.google.com" />
<target host="khms0.google.com" />
<target host="khms1.google.com" />
<target host="khms2.google.com" />
<target host="khms3.google.com" />
<target host="maps-api-ssl.google.com" />
<target host="mw2.google.com" />
<target host="maps.google.com.*" />
<target host="maps.googleapis.com" />
<!--
https://mail1.eff.org/pipermail/https-everywhere-rules/2012-September/001317.html
-->
<!--exclusion pattern="^http://maps\.googleapis\.com/map(files/lib/map_1_20\.swf|sapi/publicapi\?file=flashapi)" /-->
<exclusion pattern="^http://maps\.googleapis\.com/map(?:files/lib/map_\d+_\d+\.swf|sapi/publicapi\?file=flashapi)" />
<target host="maps.gstatic.com" />
<!--securecookie host="^maps\.google\.(com?\.)?(au|ca|gh|ie|in|jm|ke|lk|my|n[agz]|pk|rw|sl|sg|ug|uk|za|zw)$" name=".+" /-->
<securecookie host="^maps\.google\.[\w.]{2,6}$" name=".+" />
<securecookie host="^maps\.g(?:oogle|oogleapis|static)\.com$" name=".+" />
<securecookie host="^maps-api-ssl\.google\.com$" name=".+" />
<rule from="^http://maps\.google\.([^/]+)/"
to="https://maps.google.$1/" />
<!-- http://khms.../$ 404s:
-->
<rule from="^http://khms\d?\.google\.com/+\??$"
to="https://www.google.com/" />
<rule from="^http://(khms\d?|maps-api-ssl|mw2)\.google\.com/"
to="https://$1.google.com/" />
<rule from="^http://maps\.g(oogleapis|static)\.com/"
to="https://maps.g$1.com/" />
<rule from="^https://maps\.googleapis\.com/map(?=files/lib/map_\d+_\d+\.swf|sapi/publicapi\?file=flashapi)"
to="http://maps.googleapis.com/map" downgrade="1" />
</ruleset>

View File

@ -0,0 +1,6 @@
<ruleset name="GoogleMelange">
<target host="www.google-melange.com" />
<target host="google-melange.com" />
<rule from="^http://(www\.)?google-melange\.com/" to="https://www.google-melange.com/" />
</ruleset>

View File

@ -0,0 +1,135 @@
<ruleset name="Google Search">
<target host="google.com" />
<target host="*.google.com" />
<target host="google.com.*" />
<target host="www.google.com.*" />
<target host="google.co.*" />
<target host="www.google.co.*" />
<target host="google.*" />
<target host="www.google.*" />
<!--
Beyond clients1 these do not currently
exist in the ccTLDs, but just in case...
-->
<target host="clients1.google.com.*" />
<target host="clients2.google.com.*" />
<target host="clients3.google.com.*" />
<target host="clients4.google.com.*" />
<target host="clients5.google.com.*" />
<target host="clients6.google.com.*" />
<target host="clients1.google.co.*" />
<target host="clients2.google.co.*" />
<target host="clients3.google.co.*" />
<target host="clients4.google.co.*" />
<target host="clients5.google.co.*" />
<target host="clients6.google.co.*" />
<target host="clients1.google.*" />
<target host="clients2.google.*" />
<target host="clients3.google.*" />
<target host="clients4.google.*" />
<target host="clients5.google.*" />
<target host="clients6.google.*" />
<!-- Some Google pages can generate naive links back to the
unencrypted version of encrypted.google.com, which is
a 301 but theoretically vulnerable to SSL stripping.
-->
<rule from="^http://encrypted\.google\.com/"
to="https://encrypted.google.com/" />
<!-- The most basic case.
-->
<rule from="^http://(?:www\.)?google\.com/search"
to="https://encrypted.google.com/search" />
<!-- A very annoying exception that we
seem to need for the basic case
-->
<exclusion pattern="^http://(?:www\.)?google\.com/search.*tbs=shop" />
<exclusion pattern="^http://clients\d\.google\.com/.*client=products.*" />
<exclusion pattern="^http://suggestqueries\.google\.com/.*client=.*" />
<!-- https://trac.torproject.org/projects/tor/ticket/9713
-->
<exclusion pattern="^http://clients[0-9]\.google\.com/ocsp" />
<!-- This is necessary for image results
links from web search results
-->
<exclusion pattern="^http://(?:www\.)?google\.com/search.*tbm=isch.*" />
<rule from="^http://(?:www\.)?google\.com/about"
to="https://www.google.com/about" />
<!-- There are two distinct cases for these firefox searches -->
<rule from="^http://(?:www\.)?google(?:\.com?)?\.[a-z]{2}/firefox/?$"
to="https://encrypted.google.com/" />
<rule from="^http://(?:www\.)?google(?:\.com?)?\.[a-z]{2}/firefox"
to="https://encrypted.google.com/webhp" />
<rule from="^http://(?:www\.)?google\.com/webhp"
to="https://encrypted.google.com/webhp" />
<rule from="^http://codesearch\.google\.com/"
to="https://codesearch.google.com/" />
<rule from="^http://(?:www\.)?google\.com/codesearch"
to="https://www.google.com/codesearch" />
<rule from="^http://(?:www\.)?google\.com/#"
to="https://encrypted.google.com/#" />
<rule from="^http://(?:www\.)?google\.com/$"
to="https://encrypted.google.com/" />
<!-- Google supports IPv6 search, including
HTTPS with a valid certificate! -->
<rule from="^http://ipv6\.google\.com/"
to="https://ipv6.google.com/" />
<!-- most google international sites look like
"google.fr", some look like "google.co.jp",
and some crazy ones like "google.com.au" -->
<rule from="^http://(www\.)?google(\.com?)?\.([a-z]{2})/(search\?|#)"
to="https://$1google$2.$3/$4" />
<!-- Language preference setting -->
<rule from="^http://(www\.)?google(\.com?)?\.([a-z]{2})/setprefs"
to="https://$1google$2.$3/setprefs" />
<!-- Completion urls look like this:
http://clients2.google.co.jp/complete/search?hl=ja&client=hp&expIds=17259,24660,24729,24745&q=m&cp=1 HTTP/1.1\r\n
-->
<rule from="^http://clients\d\.google\.com/complete/search"
to="https://clients1.google.com/complete/search" />
<rule from="^http://clients\d\.google(\.com?\.[a-z]{2})/complete/search"
to="https://clients1.google.$1/complete/search" />
<rule from="^http://clients\d\.google\.([a-z]{2})/complete/search"
to="https://clients1.google.$1/complete/search" />
<rule from="^http://suggestqueries\.google\.com/complete/search"
to="https://clients1.google.com/complete/search" />
<rule from="^http://(www\.)?google\.(com?\.)?([a-z]{2})/(?:webhp)?$"
to="https://$1google.$2$3/" />
<!-- If there are URL parameters, keep them. -->
<rule from="^http://(www\.)?google\.(com?\.)?([a-z]{2})/(?:webhp)?\?"
to="https://$1google.$2$3/webhp?" />
<!-- teapot -->
<rule from="^http://(www\.)?google(\.com?)?\.([a-z]{2})/teapot"
to="https://$1google$2.$3/teapot" />
</ruleset>

View File

@ -0,0 +1,345 @@
<!--
Other Google rulesets:
- 2mdn.net.xml
- Admeld.xml
- ChannelIntelligence.com.xml
- Doubleclick.net.xml
- FeedBurner.xml
- Google.org.xml
- GoogleAPIs.xml
- Google_App_Engine.xml
- GoogleImages.xml
- GoogleShopping.xml
- Ingress.xml
- Meebo.xml
- Orkut.xml
- Postini.xml
- WebM_Project.org.xml
Nonfunctional domains:
- feedproxy.google.com (404, valid cert)
- partnerpage.google.com *
- safebrowsing.clients.google.com (404, mismatched)
- (www.)googlesyndicatedsearch.com (404; mismatched, CN: google.com)
- buttons.googlesyndication.com *
* 404, valid cert
Nonfunctional google.com paths:
- analytics (redirects to http)
- imgres
- gadgets *
- hangouts (404)
- u/ (404)
* Redirects to http
Problematic domains:
- www.goo.gl (404; mismatched, CN: *.google.com)
- google.com subdomains:
- books (googlebooks/, images/, & intl/ 404, but works when rewritten to www)
- cbks0 ****
- earth *
- gg ($ 404s)
- knoll *
- scholar **
- trends *
- news.google.cctld **
- scholar.google.cctld **
- *-opensocial.googleusercontent.com ***
**** $ 404s
* 404, valid cert
** Redirects to http, valid cert
*** Breaks followers widget - https://trac.torproject.org/projects/tor/ticket/7294
Partially covered domains:
- google.cctld subdomains:
- scholar (→ www)
- google.com subdomains:
- (www.)
- cbks0 ($ 404s)
- gg ($ 404s)
- news (→ www)
- scholar (→ www)
- *.googleusercontent.com (*-opensocial excluded)
Fully covered domains:
- lh[3-6].ggpht.com
- (www.)goo.gl (www → ^)
- google.com subdomains:
- accounts
- adwords
- apis
- appengine
- books (→ encrypted)
- calendar
- checkout
- chrome
- clients[12]
- code
- *.corp
- developers
- dl
- docs
- docs\d
- \d.docs
- drive
- earth (→ www)
- encrypted
- encrypted-tbn[123]
- feedburner
- fiber
- finance
- glass
- groups
- health
- helpouts
- history
- hostedtalkgadget
- id
- investor
- knol
- knoll (→ knol)
- lh\d
- mail
- chatenabled.mail
- pack
- picasaweb
- pki
- play
- plus
- plusone
- productforums
- profiles
- safebrowsing-cache
- cert-test.sandbox
- plus.sandbox
- sb-ssl
- script
- security
- services
- servicessites
- sites
- spreadsheets
- spreadsheets\d
- support
- talk
- talkgadget
- tbn2 (→ encrypted-tbn2)
- tools
- trends (→ www)
- partner.googleadservices.com
- (www.)googlecode.com
- *.googlecode.com (per-project subdomains)
- googlesource.com
- *.googlesource.com
- pagead2.googlesyndication.com
- tpc.googlesyndication.com
- mail-attachment.googleusercontent.com
- webcache.googleusercontent.com
XXX: Needs more testing
-->
<ruleset name="Google Services">
<target host="*.ggpht.com" />
<target host="gmail.com" />
<target host="www.gmail.com" />
<target host="goo.gl" />
<target host="www.goo.gl" />
<target host="google.*" />
<target host="accounts.google.*" />
<target host="adwords.google.*" />
<target host="finance.google.*" />
<target host="groups.google.*" />
<target host="it.google.*" />
<target host="news.google.*" />
<exclusion pattern="^http://(?:news\.)?google\.com/(?:archivesearch|newspapers)" />
<target host="picasaweb.google.*" />
<target host="scholar.google.*" />
<target host="www.google.*" />
<target host="*.google.ca" />
<target host="google.co.*" />
<target host="accounts.google.co.*" />
<target host="adwords.google.co.*" />
<target host="finance.google.co.*" />
<target host="groups.google.co.*" />
<target host="id.google.co.*" />
<target host="news.google.co.*" />
<target host="picasaweb.google.co.*" />
<target host="scholar.google.co.*" />
<target host="www.google.co.*" />
<target host="google.com" />
<target host="*.google.com" />
<exclusion pattern="^http://(?:www\.)?google\.com/analytics/*(?:/[^/]+)?(?:\?.*)?$" />
<!--exclusion pattern="^http://books\.google\.com/(?!books/(\w+\.js|css/|javascript/)|favicon\.ico|googlebooks/|images/|intl/)" /-->
<exclusion pattern="^http://cbks0\.google\.com/(?:$|\?)" />
<exclusion pattern="^http://gg\.google\.com/(?!csi(?:$|\?))" />
<target host="google.com.*" />
<target host="accounts.google.com.*" />
<target host="adwords.google.com.*" />
<target host="groups.google.com.*" />
<target host="id.google.com.*" />
<target host="news.google.com.*" />
<target host="picasaweb.google.com.*" />
<target host="scholar.google.com.*" />
<target host="www.google.com.*" />
<target host="partner.googleadservices.com" />
<target host="googlecode.com" />
<target host="*.googlecode.com" />
<target host="googlemail.com" />
<target host="www.googlemail.com" />
<target host="googlesource.com" />
<target host="*.googlesource.com" />
<target host="*.googlesyndication.com" />
<target host="www.googletagservices.com" />
<target host="googleusercontent.com" />
<target host="*.googleusercontent.com" />
<!--
Necessary for the Followers widget:
https://trac.torproject.org/projects/tor/ticket/7294
-->
<exclusion pattern="http://[^@:\./]+-opensocial\.googleusercontent\.com" />
<!-- Can we secure any of these wildcard cookies safely?
-->
<!--securecookie host="^\.google\.com$" name="^(hl|I4SUserLocale|NID|PREF|S)$" /-->
<!--securecookie host="^\.google\.[\w.]{2,6}$" name="^(hl|I4SUserLocale|NID|PREF|S|S_awfe)$" /-->
<securecookie host="^(?:accounts|adwords|\.code|login\.corp|developers|docs|\d\.docs|fiber|mail|picasaweb|plus|\.?productforums|support)\.google\.[\w.]{2,6}$" name=".+" />
<securecookie host="^www\.google\.com$" name="^GoogleAccountsLocale_session$" />
<securecookie host="^mail-attachment\.googleusercontent\.com$" name=".+" />
<securecookie host="^gmail\.com$" name=".+" />
<securecookie host="^www\.gmail\.com$" name=".+" />
<securecookie host="^googlemail\.com$" name=".+" />
<securecookie host="^www\.googlemail\.com$" name=".+" />
<!-- - lh 3-6 exist
- All appear identical
- Identical to lh\d.googleusercontent.com
-->
<rule from="^http://lh(\d)\.ggpht\.com/"
to="https://lh$1.ggpht.com/" />
<rule from="^http://lh(\d)\.google\.ca/"
to="https://lh$1.google.ca/" />
<rule from="^http://(www\.)?g(oogle)?mail\.com/"
to="https://$1g$2mail.com/" />
<rule from="^http://(?:www\.)?goo\.gl/"
to="https://goo.gl/" />
<!-- Redirects to http when rewritten to www:
-->
<rule from="^http://books\.google\.com/"
to="https://encrypted.google.com/" />
<!-- tisp$ 404s:
-->
<rule from="^http://(?:www\.)?google\.((?:com?\.)?\w{2,3})/tisp(?=$|\?)"
to="https://www.google.$1/tisp/" />
<!-- Paths that work on all in google.*
-->
<rule from="^http://(?:www\.)?google\.((?:com?\.)?\w{2,3})/(accounts|adplanner|ads|adsense|adwords|analytics|bookmarks|chrome|contacts|coop|cse|css|culturalinstitute|doodles|earth|favicon\.ico|finance|get|goodtoknow|googleblogs|grants|green|hostednews|images|intl|js|landing|logos|mapmaker|newproducts|news|nexus|patents|policies|prdhp|profiles|products|reader|s2|settings|shopping|support|tisp|tools|transparencyreport|trends|urchin|webmasters)(?=$|[?/])"
to="https://www.google.$1/$2" />
<!-- Paths that 404 on .cctld, but work on .com:
-->
<rule from="^http://(?:www\.)?google\.(?:com?\.)?\w{2,3}/(?=calendar|dictionary|doubleclick|help|ideas|pacman|postini|powermeter|url)"
to="https://www.google.com/" />
<rule from="^http://(?:www\.)?google\.(?:com?\.)?\w{2,3}/custom"
to="https://www.google.com/cse" />
<!-- Paths that only exist/work on .com
-->
<rule from="^http://(?:www\.)?google\.com/(\+|appsstatus|books|buzz|extern_js|glass|googlebooks|ig|insights|moderator|phone|safebrowsing|videotargetting|webfonts)(?=$|[?/])"
to="https://www.google.com/$1" />
<!-- Subdomains that work on all in google.*
-->
<rule from="^http://(accounts|adwords|finance|groups|id|picasaweb|)\.google\.((?:com?\.)?\w{2,3})/"
to="https://$1.google.$2/" />
<!-- Subdomains that only exist/work on .com
-->
<rule from="^http://(apis|appengine|books|calendar|cbks0|chat|checkout|chrome|clients[12]|code|[\w-]+\.corp|developers|dl|docs\d?|\d\.docs|drive|encrypted|encrypted-tbn[123]|feedburner|fiber|fonts|gg|glass||health|helpouts|history|(?:hosted)?talkgadget|investor|lh\d|(?:chatenabled\.)?mail|pack|pki|play|plus(?:\.sandbox)?|plusone|productforums|profiles|safebrowsing-cache|cert-test\.sandbox|sb-ssl|script|security|services|servicessites|sites|spreadsheets\d?|support|talk|tools)\.google\.com/"
to="https://$1.google.com/" />
<exclusion pattern="^http://clients[0-9]\.google\.com/ocsp"/>
<rule from="^http://earth\.google\.com/"
to="https://www.google.com/earth/" />
<rule from="^http://scholar\.google\.((?:com?\.)?\w{2,3})/intl/"
to="https://www.google.$1/intl/" />
<rule from="^http://(?:encrypted-)?tbn2\.google\.com/"
to="https://encrypted-tbn2.google.com/" />
<rule from="^http://knoll?\.google\.com/"
to="https://knol.google.com/" />
<rule from="^http://news\.google\.(?:com?\.)?\w{2,3}/(?:$|news|newshp)"
to="https://www.google.com/news" />
<rule from="^http://trends\.google\.com/"
to="https://www.google.com/trends" />
<rule from="^http://([^/:@\.]+\.)?googlecode\.com/"
to="https://$1googlecode.com/" />
<!-- The optional subdomain group takes one or more characters, so every
host covered by the *.googlesource.com target is rewritten, not just
single-character labels. -->
<rule from="^http://([^\./]+\.)?googlesource\.com/"
to="https://$1googlesource.com/" />
<rule from="^http://partner\.googleadservices\.com/"
to="https://partner.googleadservices.com/" />
<rule from="^http://(pagead2|tpc)\.googlesyndication\.com/"
to="https://$1.googlesyndication.com/" />
<!-- !www doesn't exist.
-->
<rule from="^http://www\.googletagservices\.com/tag/js/"
to="https://www.googletagservices.com/tag/js/" />
<rule from="^http://([^@:\./]+)\.googleusercontent\.com/"
to="https://$1.googleusercontent.com/" />
</ruleset>

View File

@ -0,0 +1,28 @@
<!--
For other Google coverage, see GoogleServices.xml.
-->
<ruleset name="Google Shopping">
<target host="google.*" />
<target host="www.google.*" />
<target host="google.co.*" />
<target host="www.google.co.*" />
<target host="*.google.com" />
<target host="google.com.*" />
<target host="www.google.com.*" />
<rule from="^http://encrypted\.google\.com/(prdhp|shopping)"
to="https://www.google.com/$1" />
<rule from="^http://shopping\.google\.com/"
to="https://shopping.google.com/" />
<rule from="^http://(?:encrypted|www)\.google\.com/(.*tbm=shop)"
to="https://www.google.com/$1" />
<!-- The lookahead leaves the path (prdhp|shopping) in the URL, so the
replacement must stop at "/". Substituting the matched TLD here would
produce paths such as /deshopping. -->
<rule from="^http://(?:www\.)?google\.(?:com?\.)?(?:ae|ar|at|au|bg|bh|bo|br|ca|ch|cl|cr|co|cu|de|ec|eg|es|fi|fr|gh|gt|hr|id|ie|il|in|it|jm|jo|jp|ke|kr|kw|kz|lb|lk|ly|mx|my|na|ng|nl|no|nz|om|pa|pe|pk|pl|pt|py|qa|ro|ru|rw|sa|sg|sl|se|sv|th|tr|ug|uk|uy|ve|vn|za|zw)/(?=prdhp|shopping)"
to="https://www.google.com/" />
</ruleset>

View File

@ -0,0 +1,7 @@
<ruleset name="GoogleSorry">
<target host="sorry.google.com" />
<target host="www.google.com" />
<target host="google.com" />
<rule from="^http://((sorry|www)\.)?google\.com/sorry/" to="https://sorry.google.com/sorry/" />
</ruleset>

View File

@ -0,0 +1,8 @@
<ruleset name="Google Translate (broken)" default_off="redirect loops">
<target host="translate.googleapis.com" />
<target host="translate.google.com" />
<rule from="^http://translate\.googleapis\.com/" to="https://translate.googleapis.com/"/>
<rule from="^http://translate\.google\.com/"
to="https://translate.google.com/" />
</ruleset>

View File

@ -0,0 +1,83 @@
<ruleset name="Google Videos">
<target host="*.google.com" />
<target host="google.com" />
<target host="www.google.com.*" />
<target host="google.com.*" />
<target host="www.google.co.*" />
<target host="google.co.*" />
<target host="www.google.*" />
<target host="google.*" />
<rule from="^http://encrypted\.google\.com/videohp"
to="https://encrypted.google.com/videohp" />
<!-- https://videos.google.com is currently broken; work around that... -->
<rule from="^https?://videos?\.google\.com/$"
to="https://encrypted.google.com/videohp" />
<rule from="^http://(?:www\.)?google\.com/videohp"
to="https://encrypted.google.com/videohp" />
<rule from="^http://(?:images|www|encrypted)\.google\.com/(.*tbm=isch)"
to="https://encrypted.google.com/$1" />
<rule
from="^http://(?:www\.)?google\.(?:com?\.)?(?:au|ca|gh|ie|in|jm|ke|lk|my|na|ng|nz|pk|rw|sl|sg|ug|uk|za|zw)/videohp"
to="https://encrypted.google.com/videohp" />
<rule
from="^http://(?:www\.)?google\.(?:com?\.)?(?:ar|bo|cl|co|cu|cr|ec|es|gt|mx|pa|pe|py|sv|uy|ve)/videohp$"
to="https://encrypted.google.com/videohp?hl=es" />
<rule
from="^http://(?:www\.)?google\.(?:com\.)?(?:ae|bh|eg|jo|kw|lb|ly|om|qa|sa)/videohp$"
to="https://encrypted.google.com/videohp?hl=ar" />
<rule from="^http://(?:www\.)?google\.(?:at|ch|de)/videohp$"
to="https://encrypted.google.com/videohp?hl=de" />
<rule from="^http://(?:www\.)?google\.(fr|nl|it|pl|ru|bg|pt|ro|hr|fi|no)/videohp$"
to="https://encrypted.google.com/videohp?hl=$1" />
<rule from="^http://(?:www\.)?google\.com?\.(id|th|tr)/videohp$"
to="https://encrypted.google.com/videohp?hl=$1" />
<rule from="^http://(?:www\.)?google\.com\.il/videohp$"
to="https://encrypted.google.com/videohp?hl=he" />
<rule from="^http://(?:www\.)?google\.com\.kr/videohp$"
to="https://encrypted.google.com/videohp?hl=ko" />
<rule from="^http://(?:www\.)?google\.com\.kz/videohp$"
to="https://encrypted.google.com/videohp?hl=kk" />
<rule from="^http://(?:www\.)?google\.com\.jp/videohp$"
to="https://encrypted.google.com/videohp?hl=ja" />
<rule from="^http://(?:www\.)?google\.com\.vn/videohp$"
to="https://encrypted.google.com/videohp?hl=vi" />
<rule from="^http://(?:www\.)?google\.com\.br/videohp$"
to="https://encrypted.google.com/videohp?hl=pt-BR" />
<rule from="^http://(?:www\.)?google\.se/videohp$"
to="https://encrypted.google.com/videohp?hl=sv" />
<!-- If there are URL parameters, keep them. -->
<rule
from="^http://(?:www\.)?google\.(?:com?\.)?(?:ar|bo|cl|co|cu|cr|ec|es|gt|mx|pa|pe|py|sv|uy|ve)/videohp\?"
to="https://encrypted.google.com/videohp?hl=es&#38;" />
<rule
from="^http://(?:www\.)?google\.(?:com\.)?(?:ae|bh|eg|jo|kw|lb|ly|om|qa|sa)/videohp\?"
to="https://encrypted.google.com/videohp?hl=ar&#38;" />
<rule from="^http://(?:www\.)?google\.(?:at|ch|de)/videohp\?"
to="https://encrypted.google.com/videohp?hl=de&#38;" />
<rule from="^http://(?:www\.)?google\.(fr|nl|it|pl|ru|bg|pt|ro|hr|fi|no)/videohp\?"
to="https://encrypted.google.com/videohp?hl=$1&#38;" />
<rule from="^http://(?:www\.)?google\.com?\.(id|th|tr)/videohp\?"
to="https://encrypted.google.com/videohp?hl=$1&#38;" />
<rule from="^http://(?:www\.)?google\.com\.il/videohp\?"
to="https://encrypted.google.com/videohp?hl=he&#38;" />
<rule from="^http://(?:www\.)?google\.com\.kr/videohp\?"
to="https://encrypted.google.com/videohp?hl=ko&#38;" />
<rule from="^http://(?:www\.)?google\.com\.kz/videohp\?"
to="https://encrypted.google.com/videohp?hl=kk&#38;" />
<rule from="^http://(?:www\.)?google\.com\.jp/videohp\?"
to="https://encrypted.google.com/videohp?hl=ja&#38;" />
<rule from="^http://(?:www\.)?google\.com\.vn/videohp\?"
to="https://encrypted.google.com/videohp?hl=vi&#38;" />
<rule from="^http://(?:www\.)?google\.com\.br/videohp\?"
to="https://encrypted.google.com/videohp?hl=pt-BR&#38;" />
<rule from="^http://(?:www\.)?google\.se/videohp\?"
to="https://encrypted.google.com/videohp?hl=sv&#38;" />
<rule from="^http://video\.google\.com/ThumbnailServer2"
to="https://video.google.com/ThumbnailServer2" />
</ruleset>

View File

@ -0,0 +1,17 @@
<!--
gwbhrd.appspot.com
-->
<ruleset name="GoogleWatchBlog">
<target host="googlewatchblog.de" />
<target host="*.googlewatchblog.de" />
<securecookie host="^(?:www)?\.googlewatchblog\.de$" name=".+" />
<rule from="^http://(static\.|www\.)?googlewatchblog\.de/"
to="https://$1googlewatchblog.de/" />
</ruleset>

View File

@ -0,0 +1,21 @@
<!--
For other Google coverage, see GoogleServices.xml.
-->
<ruleset name="Google App Engine">
<target host="appspot.com" />
<target host="*.appspot.com" />
<!--
Redirects to http for some reason.
-->
<exclusion pattern="^http://photomunchers\.appspot\.com/" />
<securecookie host="^.+\.appspot\.com$" name=".+" />
<rule from="^http://([^@:\./]+\.)?appspot\.com/"
to="https://$1appspot.com/" />
</ruleset>

View File

@ -0,0 +1,16 @@
<!-- This rule was automatically generated based on an HSTS
preload rule in the Chromium browser. See
https://src.chromium.org/viewvc/chrome/trunk/src/net/base/transport_security_state.cc
for the list of preloads. Sites are added to the Chromium HSTS
preload list on request from their administrators, so HTTPS should
work properly everywhere on this site.
Because Chromium and derived browsers automatically force HTTPS for
every access to this site, this rule applies only to Firefox. -->
<ruleset name="Googleplex.com (default off)" platform="firefox" default_off="Certificate error">
<target host="googleplex.com" />
<securecookie host="^googleplex\.com$" name=".+" />
<rule from="^http://googleplex\.com/" to="https://googleplex.com/" />
</ruleset>

View File

@ -0,0 +1,15 @@
<ruleset name="OpenStreetMap">
<target host="openstreetmap.org"/>
<target host="*.openstreetmap.org"/>
<rule from="^http://(?:www\.)?openstreetmap\.org/"
to="https://www.openstreetmap.org/"/>
<rule from="^http://tile\.openstreetmap\.org/"
to="https://a.tile.openstreetmap.org/"/>
<rule from="^http://(blog|help|lists|nominatim|piwik|taginfo|[abc]\.tile|trac|wiki)\.openstreetmap\.org/"
to="https://$1.openstreetmap.org/"/>
</ruleset>

View File

@ -0,0 +1,14 @@
<!--
www: cert only matches ^rawgithub.com
-->
<ruleset name="rawgithub.com">
<target host="rawgithub.com" />
<target host="www.rawgithub.com" />
<rule from="^http://(?:www\.)?rawgithub\.com/"
to="https://rawgithub.com/" />
</ruleset>

View File

@ -0,0 +1,101 @@
<!--
CDN buckets:
- akmedia-a.akamaihd.net
- soundcloud.assistly.com
- help.soundcloud.com
- cs70.wac.edgecastcdn.net
- a1.sndcdn.com
- i1.sndcdn.com
- w1.sndcdn.com
- wpc.658D.edgecastcdn.net
- m-a.sndcdn.com.edgesuite.net
- soundcloud.gettyimages.com
- scbackstage.wpengine.netdna-cdn.com
- ssl doesn't exist
- backstage.soundcloud.com
- soundcloud.wpengine.netdna-cdn.com
- -ssl doesn't exist
- blog.soundcloud.com
- gs1.wpc.v2cdn.netcdn.net
- gs1.wpc.v2cdn.net
- ec-media.soundcloud.com
Nonfunctional soundcloud.com subdomains:
- help (redirects to http, mismatched, CN: *.assistly.com)
- m (redirects to http)
- media
- status (times out)
Problematic domains:
- m-a.sndcdn.com (works, akamai)
Partially covered domains:
- backstage.soundcloud.com
Fully covered domains:
- sndcdn.com subdomains:
- a[12]
- api
- i[1-4]
- w[12]
- wis
- soundcloud.com subdomains:
- (www.)
- api
- blog
- connect
- developers
- ec-media
- eventlogger
- help-assets
- media
- visuals
- w
-->
<ruleset name="Soundcloud (partial)">
<target host="scbackstage.wpengine.netdna-cdn.com" />
<target host="soundcloud.wpengine.netdna-cdn.com" />
<target host="*.sndcdn.com" />
<target host="soundcloud.com" />
<target host="*.soundcloud.com" />
<exclusion pattern="^https?://(?:scbackstage\.wpengine\.netdna-cdn|backstage\.soundcloud)\.com/(?!wp-content/)" />
<rule from="^http://([aiw]\d|api|wis)\.sndcdn\.com/"
to="https://$1.sndcdn.com/" />
<rule from="^http://((?:api|backstage|blog|connect|developers|ec-media|eventlogger|help-assets|media|visuals|w|www)\.)?soundcloud\.com/"
to="https://$1soundcloud.com/" />
<rule from="^https?://scbackstage\.wpengine\.netdna-cdn\.com/"
to="https://backstage.soundcloud.com/" />
<rule from="^https?://soundcloud\.wpengine\.netdna-cdn\.com/"
to="https://blog.soundcloud.com/" />
</ruleset>

View File

@ -0,0 +1,36 @@
<!--
Nonfunctional:
- image.bayimg.com
- (www.)thepiratebay.sx (http reply)
For problematic rules, see ThePirateBay-mismatches.xml.
-->
<ruleset name="The Pirate Bay (partial)">
<target host="suprbay.org" />
<target host="*.suprbay.org" />
<!-- * for cross-domain cookie -->
<target host="*.forum.suprbay.org" />
<target host="thepiratebay.org"/>
<target host="*.thepiratebay.org"/>
<target host="thepiratebay.se"/>
<target host="*.thepiratebay.se"/>
<securecookie host="^.*\.suprbay\.org$" name=".*" />
<securecookie host="^(.*\.)?thepiratebay\.se$" name=".*"/>
<!-- Cert doesn't match (www.), redirects like so. -->
<rule from="^https?://(?:forum\.|www\.)?suprbay\.org/"
to="https://forum.suprbay.org/" />
<rule from="^http://(?:www\.)?thepiratebay\.(?:org|se)/"
to="https://thepiratebay.se/"/>
<rule from="^http://(rss|static|torrents)\.thepiratebay\.(?:org|se)/"
to="https://$1.thepiratebay.se/"/>
</ruleset>

View File

@ -0,0 +1,18 @@
<ruleset name="Tor Project">
<target host="torproject.org" />
<target host="*.torproject.org" />
<exclusion pattern="^http://torperf\.torproject\.org/" />
<!-- Not secured by server:
-->
<!--securecookie host="^\.blog\.torproject\.org$" name="^SESS[0-9a-f]{32}$" /-->
<securecookie host="^(?:.*\.)?torproject\.org$" name=".+" />
<rule from="^http://([^/:@\.]+\.)?torproject\.org/"
to="https://$1torproject.org/" />
</ruleset>

View File

@ -0,0 +1,169 @@
<!--
Other Twitter rulesets:
- Twitter_Community.com.xml
Nonfunctional domains:
- status.twitter.com *
- status.twitter.jp *
* Tumblr
CDN buckets:
- a1095.g.akamai.net/=/1095/134446/1d/platform.twitter.com/ | platform2.twitter.com.edgesuite.net
- platform2.twitter.com
- twitter-any.s3.amazonaws.com
- twitter-blog.s3.amazonaws.com
- d2rdfnizen5apl.cloudfront.net
- s.twimg.com
- ssl2.twitter.com.edgekey.net
- twitter.github.com
Problematic domains:
- twimg.com subdomains:
- a5 *
- s (cloudfront)
- twitter.com subdomains:
- platform[0-3] (403, akamai)
* akamai
Fully covered domains:
- (www.)t.co (www → ^)
- twimg.com subdomains:
- a[5-9] (→ si0)
- a\d
- abs
- dnt
- ea
- g
- g2
- gu
- hca
- jp
- ma
- ma[0123]
- o
- p
- pbs
- r
- s (→ d2rdfnizen5apl.cloudfront.net)
- si[0-5]
- syndication
- cdn.syndication
- tailfeather
- ton
- v
- widgets
- twitter.com subdomains:
- (www.)
- 201[012]
- about
- ads
- analytics
- api
- cdn.api
- urls.api
- blog
- business
- preview.cdn
- preview-dev.cdn
- preview-stage.cdn
- de
- dev
- en
- engineering
- es
- firefox
- fr
- it
- ja
- jp
- m
- media
- mobile
- music
- oauth
- p
- pic
- platform
- platform[0-3] (→ platform)
- widgets.platform
- search
- static
- support
- transparency
- upload
These altnames don't exist:
- i3.twimg.com
- p-dev.twimg.com
- vmtc.twimg.com
- cdn-dev.api.twitter.com
-->
<ruleset name="Twitter">
<target host="t.co" />
<target host="*.t.co" />
<target host="*.twimg.com" />
<target host="twitter.com" />
<target host="*.twitter.com" />
<!-- Secured by server:
-->
<!--securecookie host="^\.twitter\.com$" name="^_twitter_sess$" /-->
<!--securecookie host="^support\.twitter\.com$" name="^_help_center_session$" /-->
<!--
Not secured by server:
-->
<!--securecookie host="^\.t\.co$" name="^muc$" /-->
<!--securecookie host="^\.twitter\.com$" name="^guest_id$" /-->
<securecookie host="^\.t\.co$" name=".+" />
<securecookie host="^(?:.*\.)?twitter\.com$" name=".+" />
<rule from="^http://(?:www\.)?t\.co/"
to="https://t.co/" />
<rule from="^http://a[5-9]\.twimg\.com/"
to="https://si0.twimg.com/" />
<rule from="^http://(abs|a\d|dnt|ea|g[2u]?|hca|jp|ma\d?|o|p|pbs|r|si\d|(?:cdn\.)?syndication|tailfeather|ton|v|widgets)\.twimg\.com/"
to="https://$1.twimg.com/" />
<rule from="^http://s\.twimg\.com/"
to="https://d2rdfnizen5apl.cloudfront.net/" />
<rule from="^http://((?:201\d|about|ads|analytics|blog|(?:cdn\.|urls\.)?api|business|preview(?:-dev|-stage)?\.cdn|de|dev|engineering|en|es|firefox|fr|it|ja|jp|m|media|mobile|music|oauth|p|pic|platform|widgets\.platform|search|static|support|transparency|upload|www)\.)?twitter\.com/"
to="https://$1twitter.com/" />
<rule from="^http://platform\d\.twitter\.com/"
to="https://platform.twitter.com/" />
</ruleset>

View File

@ -0,0 +1,75 @@
<!--
CDN buckets:
- av.vimeo.com.edgesuite.net
- a808.g.akamai.net
- pdl.vimeocdn.com.edgesuite.net
- a1189.g.akamai.net
Problematic subdomains:
- av (pdl.../crossdomain.xml restricts to port 80)
- pdl (works, akamai)
Partially covered subdomains:
- developer (some pages redirect to http)
- pdl (→ akamai)
Fully covered subdomains:
- (www.)
- secure
Default off per https://trac.torproject.org/projects/tor/ticket/7569 -->
<ruleset name="Vimeo (default off)" default_off="breaks some video embedding">
<target host="vimeo.com" />
<target host="*.vimeo.com" />
<exclusion pattern="^http://av\.vimeo\.com/crossdomain\.xml" />
<!--exclusion pattern="^http://developer\.vimeo\.com/($|\?|(apps|guidelines|help|player)($|[?/]))" /-->
<exclusion pattern="^http://developer\.vimeo\.com/(?!apis(?:$|[?/])|favicon\.ico)" />
<target host="*.vimeocdn.com" />
<!--
Uses crossdomain.xml from s3.amazonaws.com, which sets secure="false"
https://mail1.eff.org/pipermail/https-everywhere/2012-October/001583.html
-->
<exclusion pattern="^http://a\.vimeocdn\.com/p/flash/moogaloop/" />
<!-- We cannot secure streams because crossdomain.xml
restricts to port 80 :(
-->
<exclusion pattern="^http://pdl\.vimeocdn\.com/(?!crossdomain\.xml)" />
<!-- Tracking cookies:
-->
<securecookie host="^\.(?:player\.)?vimeo\.com$" name="^__utm\w$" />
<rule from="^http://((?:developer|player|secure|www)\.)?vimeo\.com/"
to="https://$1vimeo.com/" />
<rule from="^http://av\.vimeo\.com/"
to="https://a248.e.akamai.net/f/808/9207/8m/av.vimeo.com/" />
<!-- a & b: Akamai -->
<rule from="^http://(?:secure-)?([ab])\.vimeocdn\.com/"
to="https://secure-$1.vimeocdn.com/" />
<rule from="^http://i\.vimeocdn\.com/"
to="https://i.vimeocdn.com/" />
<rule from="^http://pdl\.vimeocdn\.com/"
to="https://a248.e.akamai.net/f/1189/4415/8d/pdl.vimeocdn.com/" />
</ruleset>

View File

@ -0,0 +1,13 @@
<ruleset name="WikiLeaks">
<target host="wikileaks.org" />
<target host="*.wikileaks.org" />
<securecookie host="^(?:w*\.)?wikileaks\.org$" name=".+" />
<rule from="^http://((?:chat|search|shop|www)\.)?wikileaks\.org/"
to="https://$1wikileaks.org/" />
</ruleset>

View File

@ -0,0 +1,107 @@
<!--
Wikipedia and other Wikimedia Foundation wikis previously had no real HTTPS support, and
URLs had to be rewritten to https://secure.wikimedia.org/$wikitype/$language/ . This is no
longer the case, see https://blog.wikimedia.org/2011/10/03/native-https-support-enabled-for-all-wikimedia-foundation-wikis/ ,
so this file is a lot simpler these days.
Mixed content:
- Images, on:
- stats.wikimedia.org from upload.wikimedia.org *
- stats.wikimedia.org from wikimediafoundation.org *
* Secured by us
-->
<ruleset name="Wikimedia">
<target host="enwp.org" />
<target host="frwp.org" />
<target host="mediawiki.org" />
<target host="www.mediawiki.org" />
<target host="wikimedia.org" />
<target host="*.wikimedia.org" />
<exclusion pattern="^http://(?:apt|cs|cz|parsoid-lb\.eqiad|status|torrus|ubuntu)\.wikimedia\.org" />
<!-- https://mail1.eff.org/pipermail/https-everywhere-rules/2012-June/001189.html -->
<exclusion pattern="^http://lists\.wikimedia\.org/pipermail(?:$|/)" />
<target host="wikimediafoundation.org" />
<target host="www.wikimediafoundation.org" />
<!-- Wikimedia projects (also some wikimedia.org subdomains) -->
<target host="wikibooks.org" />
<target host="*.wikibooks.org" />
<target host="wikidata.org" />
<target host="*.wikidata.org" />
<target host="wikinews.org" />
<target host="*.wikinews.org" />
<target host="wikipedia.org" />
<target host="*.wikipedia.org" />
<target host="wikiquote.org" />
<target host="*.wikiquote.org" />
<target host="wikisource.org" />
<target host="*.wikisource.org" />
<target host="wikiversity.org" />
<target host="*.wikiversity.org" />
<target host="wikivoyage.org" />
<target host="*.wikivoyage.org" />
<target host="wiktionary.org" />
<target host="*.wiktionary.org" />
<!-- Wikimedia chapters -->
<target host="wikimedia.ca" />
<target host="www.wikimedia.ca" />
<!-- Wikimedia Tool Labs -->
<target host="tools.wmflabs.org" />
<target host="icinga.wmflabs.org" />
<target host="ganglia.wmflabs.org" />
<!-- Not secured by server:
-->
<!--securecookie host="^\.wiki(books|ipedia)\.org$" name="^GeoIP$" /-->
<!-- Host patterns are regular expressions: each is anchored once at both
ends, and literal dots are escaped so they cannot match other characters. -->
<securecookie host="^\.wik(?:ibooks|idata|imedia|inews|ipedia|iquote|isource|iversity|ivoyage|tionary)\.org$" name="^GeoIP$" />
<securecookie host="^([^@:/]+\.)?wik(ibooks|idata|inews|ipedia|iquote|isource|iversity|ivoyage|tionary)\.org$" name=".*" />
<securecookie host="^(species|commons|meta|incubator|wikitech)\.wikimedia\.org$" name=".*" />
<securecookie host="^(?:www\.)?mediawiki\.org$" name=".*" />
<securecookie host="^wikimediafoundation\.org$" name=".*" />
<rule from="^http://(en|fr)wp\.org/"
to="https://$1.wikipedia.org/wiki/" />
<rule from="^http://(?:www\.)?mediawiki\.org/"
to="https://www.mediawiki.org/" />
<rule from="^https?://download\.wikipedia\.org/"
to="https://dumps.wikimedia.org/" />
<rule from="^https?://(download|dataset2|sitemap)\.wikimedia\.org/"
to="https://dumps.wikimedia.org/" />
<rule from="^https?://(labs-ns[01]|virt0)\.wikimedia\.org/"
to="https://wikitech.wikimedia.org/" />
<rule from="^https?://noboard\.chapters\.wikimedia\.org/"
to="https://noboard-chapters.wikimedia.org/" />
<rule from="^https?://wg\.en\.wikipedia\.org/"
to="https://wg-en.wikipedia.org/" />
<rule from="^https?://arbcom\.(de|en|fi|nl)\.wikipedia\.org/"
to="https://arbcom-$1.wikipedia.org/" />
<rule from="^http://([^@:/]+\.)?wik(ibooks|idata|imedia|inews|ipedia|iquote|isource|iversity|ivoyage|tionary)\.org/"
to="https://$1wik$2.org/" />
<rule from="^http://(www\.)?wikimediafoundation\.org/"
to="https://$1wikimediafoundation.org/" />
<rule from="^http://(www\.)?wikimedia\.ca/"
to="https://wikimedia.ca/" />
<rule from="^http://([^@:/]+)\.wmflabs\.org/"
to="https://$1.wmflabs.org/" />
</ruleset>

2450
searx/https_rules/Yahoo.xml Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,46 @@
<ruleset name="YouTube (partial)">
<target host="youtube.com" />
<target host="*.youtube.com" />
<exclusion pattern="^http://(?:www\.)?youtube\.com/crossdomain\.xml"/>
<exclusion pattern="^http://(?:www\.)?youtube\.com/(?:apiplayer|api_video_info)"/>
<exclusion pattern="^http://(?:[^/@:\.]+\.)?ytimg\.com/.*apiplayer[0-9]*\.swf"/>
<target host="*.ytimg.com" />
<target host="youtu.be" />
<target host="youtube-nocookie.com"/>
<target host="www.youtube-nocookie.com"/>
<target host="*.googlevideo.com"/>
<exclusion pattern="^http://([^/@:\.]+)\.googlevideo\.com/crossdomain\.xml"/>
<!-- Not secured by server:
-->
<!--securecookie host="^\.youtube\.com$" name="^(GEUP|PREF|VISITOR_INFO1_LIVE|YSC)$" /-->
<!-- observed ^. cookies:
- use_hitbox
- VISITOR_INFO1_LIVE
- recently_watched_video_id_list
- .youtube.com -->
<!-- Anchored at both ends so that only the exact .youtube.com cookie
domain matches, not longer hosts that merely start with it. -->
<securecookie host="^\.youtube\.com$" name=".*"/>
<rule from="^http://(www\.)?youtube\.com/"
to="https://$1youtube.com/"/>
<rule from="^http://(br|de|es|fr|il|img|insight|jp|m|nl|uk)\.youtube\.com/"
to="https://$1.youtube.com/"/>
<rule from="^http://([^/@:\.]+)\.ytimg\.com/"
to="https://$1.ytimg.com/"/>
<rule from="^http://youtu\.be/"
to="https://youtu.be/"/>
<rule from="^http://(?:www\.)?youtube-nocookie\.com/"
to="https://www.youtube-nocookie.com/"/>
<rule from="^http://([^/@:\.]+)\.googlevideo\.com/"
to="https://$1.googlevideo.com/"/>
</ruleset>

View File

@ -4,6 +4,9 @@ server:
debug : False
request_timeout : 3.0 # seconds
base_url: False
themes_path : ""
default_theme : default
https_rewrite : True
engines:
- name : general_dummy

View File

@ -50,6 +50,9 @@ from searx.search import Search
from searx.query import Query
from searx.autocomplete import backends as autocomplete_backends
from urlparse import urlparse
import re
static_path, templates_path, themes =\
get_themes(settings['themes_path']
@ -206,16 +209,60 @@ def index():
if not search.paging and engines[result['engine']].paging:
search.paging = True
# check if HTTPS rewrite is required
if settings['server']['https_rewrite']\
and result['parsed_url'].scheme == 'http':
for http_regex, https_url in https_rules:
if http_regex.match(result['url']):
result['url'] = http_regex.sub(https_url, result['url'])
# TODO result['parsed_url'].scheme
skip_https_rewrite = False
# check if HTTPS rewrite is possible
for target, rules, exclusions in https_rules:
# check if target regex match with url
if target.match(result['url']):
# process exclusions
for exclusion in exclusions:
# check if exclusion match with url
if exclusion.match(result['url']):
skip_https_rewrite = True
break
# skip https rewrite if required
if skip_https_rewrite:
break
# process rules
for rule in rules:
try:
# TODO, precompile rule
p = re.compile(rule[0])
# rewrite url if possible
new_result_url = p.sub(rule[1], result['url'])
except:
break
# parse new url
new_parsed_url = urlparse(new_result_url)
# continue if nothing was rewritten
if result['url'] == new_result_url:
continue
# get domainname from result
# TODO: this only works correctly for TLDs like asdf.com, not for asdf.com.de
# TODO: use publicsuffix instead of this rewrite rule
old_result_domainname = '.'.join(result['parsed_url'].hostname.split('.')[-2:])
new_result_domainname = '.'.join(new_parsed_url.hostname.split('.')[-2:])
# check if rewritten hostname is the same, to protect against wrong or malicious rewrite rules
if old_result_domainname == new_result_domainname:
# set new url
result['url'] = new_result_url
# target has matched, do not search over the other rules
break
# HTTPS rewrite
if search.request_data.get('format', 'html') == 'html':
if 'content' in result:
result['content'] = highlight_content(result['content'],