1
1
{-# LANGUAGE RecordWildCards, NamedFieldPuns, RecursiveDo #-}
2
+ {-# LANGUAGE TupleSections #-}
2
3
3
4
module Distribution.Server.Features.Sitemap (
4
5
SitemapFeature (.. )
@@ -25,7 +26,25 @@ import Data.ByteString.Lazy (ByteString)
25
26
import Data.Time.Clock (UTCTime (.. ), getCurrentTime )
26
27
import Data.Time.Calendar (showGregorian )
27
28
import Network.URI
29
+ import Control.DeepSeq
30
+ import Text.Read
31
+ import Data.List.Split
32
+ import Distribution.Server.Framework.BlobStorage
33
+ import Distribution.Server.Features.TarIndexCache
34
+ import qualified Data.TarIndex as Tar
35
+ import System.FilePath (takeExtension )
28
36
37
+ data Sitemap
38
+ = Sitemap
39
+ { sitemapIndex :: XMLResponse
40
+ , sitemaps :: [XMLResponse ]
41
+ }
42
+
43
+ instance NFData Sitemap where
44
+ rnf (Sitemap i s) = rnf i `seq` rnf s
45
+
46
+ instance MemSize Sitemap where
47
+ memSize (Sitemap i s) = memSize2 i s
29
48
30
49
data SitemapFeature = SitemapFeature {
31
50
sitemapFeatureInterface :: HackageFeature
@@ -38,6 +57,7 @@ initSitemapFeature :: ServerEnv
38
57
-> IO ( CoreFeature
39
58
-> DocumentationFeature
40
59
-> TagsFeature
60
+ -> TarIndexCacheFeature
41
61
-> IO SitemapFeature )
42
62
43
63
initSitemapFeature env@ ServerEnv { serverCacheDelay,
@@ -46,10 +66,11 @@ initSitemapFeature env@ServerEnv{ serverCacheDelay,
46
66
47
67
return $ \ coref@ CoreFeature {.. }
48
68
docsCore@ DocumentationFeature {.. }
49
- tagsf@ TagsFeature {.. } -> do
69
+ tagsf@ TagsFeature {.. }
70
+ tarf@ TarIndexCacheFeature {.. } -> do
50
71
51
72
rec let (feature, updateSitemapCache) =
52
- sitemapFeature env coref docsCore tagsf
73
+ sitemapFeature env coref docsCore tagsf tarf
53
74
initTime sitemapCache
54
75
55
76
sitemapCache <- newAsyncCacheNF updateSitemapCache
@@ -66,63 +87,85 @@ sitemapFeature :: ServerEnv
66
87
-> CoreFeature
67
88
-> DocumentationFeature
68
89
-> TagsFeature
90
+ -> TarIndexCacheFeature
69
91
-> UTCTime
70
- -> AsyncCache XMLResponse
71
- -> (SitemapFeature , IO XMLResponse )
92
+ -> AsyncCache Sitemap
93
+ -> (SitemapFeature , IO Sitemap )
72
94
sitemapFeature ServerEnv {.. }
73
95
CoreFeature {.. }
74
96
DocumentationFeature {.. }
75
97
TagsFeature {.. }
98
+ TarIndexCacheFeature {cachedTarIndex}
76
99
initTime
77
100
sitemapCache
78
101
= (SitemapFeature {.. }, updateSitemapCache)
79
102
where
80
103
81
104
sitemapFeatureInterface = (emptyHackageFeature " sitemap" ) {
82
- featureResources = [ xmlSitemapResource ]
105
+ featureResources = [ xmlSitemapIndexResource, xmlSitemapResource ]
83
106
, featureState = []
84
- , featureDesc = " Provides a sitemap.xml for search engines"
107
+ , featureDesc = " Provides sitemap for search engines"
85
108
, featureCaches =
86
109
[ CacheComponent {
87
- cacheDesc = " sitemap.xml " ,
110
+ cacheDesc = " sitemap" ,
88
111
getCacheMemSize = memSize <$> readAsyncCache sitemapCache
89
112
}
90
113
]
91
114
, featurePostInit = do
92
115
syncAsyncCache sitemapCache
93
116
addCronJob serverCron CronJob {
94
- cronJobName = " regenerate the cached sitemap.xml " ,
117
+ cronJobName = " regenerate the cached sitemap" ,
95
118
cronJobFrequency = DailyJobFrequency ,
96
119
cronJobOneShot = False ,
97
120
cronJobAction = prodAsyncCache sitemapCache " cron"
98
121
}
99
122
}
100
123
124
+ xmlSitemapIndexResource :: Resource
125
+ xmlSitemapIndexResource = (resourceAt " /sitemap_index.xml" ) {
126
+ resourceDesc = [(GET , " The dynamically generated sitemap index, in XML format" )]
127
+ , resourceGet = [(" xml" , serveSitemapIndex)]
128
+ }
129
+
101
130
xmlSitemapResource :: Resource
102
- xmlSitemapResource = (resourceAt " /sitemap.xml " ) {
131
+ xmlSitemapResource = (resourceAt " /sitemap/:filename " ) {
103
132
resourceDesc = [(GET , " The dynamically generated sitemap, in XML format" )]
104
133
, resourceGet = [(" xml" , serveSitemap)]
105
134
}
106
135
107
- serveSitemap :: DynamicPath -> ServerPartE Response
108
- serveSitemap _ = do
109
- sitemapXML <- liftIO $ readAsyncCache sitemapCache
136
+ serveSitemapIndex :: DynamicPath -> ServerPartE Response
137
+ serveSitemapIndex _ = do
138
+ Sitemap { .. } <- liftIO $ readAsyncCache sitemapCache
110
139
cacheControlWithoutETag [Public , maxAgeDays 1 ]
111
- return (toResponse sitemapXML)
140
+ return (toResponse sitemapIndex)
141
+
142
+ serveSitemap :: DynamicPath -> ServerPartE Response
143
+ serveSitemap dpath =
144
+ case lookup " filename" dpath of
145
+ Just filename
146
+ | [basename, " xml" ] <- splitOn " ." filename
147
+ , Just i <- readMaybe basename -> do
148
+ Sitemap {.. } <- liftIO $ readAsyncCache sitemapCache
149
+ guard (i < length sitemaps)
150
+ cacheControlWithoutETag [Public , maxAgeDays 1 ]
151
+ return (toResponse (sitemaps !! i))
152
+ _ -> mzero
112
153
113
154
-- Generates a list of sitemap entries corresponding to hackage pages, then
114
155
-- builds and returns an XML sitemap.
115
- updateSitemapCache :: IO XMLResponse
156
+ updateSitemapCache :: IO Sitemap
116
157
updateSitemapCache = do
117
158
118
159
alltags <- queryGetTagList
119
160
pkgIndex <- queryGetPackageIndex
120
161
docIndex <- queryDocumentationIndex
121
162
122
- let sitemap = generateSitemap serverBaseURI pageBuildDate
163
+ sitemaps <- generateSitemap serverBaseURI pageBuildDate
123
164
(map fst alltags)
124
- pkgIndex docIndex
125
- return (XMLResponse sitemap)
165
+ pkgIndex docIndex cachedTarIndex
166
+ let uriScheme i = " /sitemap/" <> show i <> " .xml"
167
+ sitemapIndex = renderSitemapIndex serverBaseURI (map uriScheme [0 .. (length sitemaps - 1 )])
168
+ return $ Sitemap (XMLResponse sitemapIndex) (map XMLResponse sitemaps)
126
169
127
170
pageBuildDate :: T. Text
128
171
pageBuildDate = T. pack (showGregorian (utctDay initTime))
@@ -131,19 +174,21 @@ generateSitemap :: URI
131
174
-> T. Text
132
175
-> [Tag ]
133
176
-> PackageIndex. PackageIndex PkgInfo
134
- -> Map. Map PackageId a
135
- -> ByteString
136
- generateSitemap serverBaseURI pageBuildDate alltags pkgIndex docIndex =
137
- renderSitemap serverBaseURI allEntries
177
+ -> Map. Map PackageId BlobId
178
+ -> (BlobId -> IO Tar. TarIndex )
179
+ -> IO [ByteString ]
180
+ generateSitemap serverBaseURI pageBuildDate alltags pkgIndex docIndex cachedTarIndex = do
181
+ versionedDocSubEntries <- versionedDocSubEntriesIO
182
+ let -- Combine and build sitemap
183
+ allEntries = miscEntries
184
+ ++ tagEntries
185
+ ++ nameEntries
186
+ ++ nameVersEntries
187
+ ++ baseDocEntries
188
+ ++ versionedDocEntries
189
+ ++ versionedDocSubEntries
190
+ pure $ renderSitemap serverBaseURI <$> chunksOf 50000 allEntries
138
191
where
139
- -- Combine and build sitemap
140
- allEntries = miscEntries
141
- ++ tagEntries
142
- ++ nameEntries
143
- ++ nameVersEntries
144
- ++ baseDocEntries
145
- ++ versionedDocEntries
146
-
147
192
-- Misc. pages
148
193
-- e.g. ["http://myhackage.com/index", ...]
149
194
miscEntries = urlsToSitemapEntries miscPages pageBuildDate Weekly 0.75
@@ -224,3 +269,26 @@ generateSitemap serverBaseURI pageBuildDate alltags pkgIndex docIndex =
224
269
, Map. member (packageId pkg) docIndex
225
270
]
226
271
pageBuildDate Monthly 0.25
272
+
273
+ -- Versioned doc pages in subdirectories
274
+ -- versionedSubDocURIs :: [path :: String]
275
+ -- e.g. ["http://myhackage.com/packages/mypackage-1.0.2/docs/Lib.html", ...]
276
+ versionedDocSubEntriesIO = do
277
+ let pkgs = [ (pkg , blob)
278
+ | pkg <- concat pkgss
279
+ , Just blob <- [Map. lookup (packageId pkg) docIndex]
280
+ ]
281
+ pkgIndices <- traverse (\ (pkg, blob) -> (pkg,) <$> cachedTarIndex blob) pkgs
282
+ pure $ urlsToSitemapEntries
283
+ [ prefixPkgURI ++ display (packageId pkg) ++ " /docs" ++ fp
284
+ | (pkg, tarIndex) <- pkgIndices
285
+ , Just tar <- [Tar. lookup tarIndex " " ]
286
+ , fp <- entryToPaths " /" tar
287
+ , takeExtension fp == " .html"
288
+ ]
289
+ pageBuildDate Monthly 0.25
290
+
291
+ entryToPaths :: FilePath -> Tar. TarIndexEntry -> [FilePath ]
292
+ entryToPaths _ (Tar. TarFileEntry _) = []
293
+ entryToPaths base (Tar. TarDir content) = map ((base </> ) . fst ) content ++
294
+ [ file | (folder, entry) <- content, file <- entryToPaths (base </> folder) entry ]
0 commit comments